direct io write support

Summary:
rocksdb direct io support

```
[gzh@dev11575.prn2 ~/rocksdb] ./db_bench -benchmarks=fillseq --num=1000000
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
RocksDB:    version 5.0
Date:       Wed Nov 23 13:17:43 2016
CPU:        40 * Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz
CPUCache:   25600 KB
Keys:       16 bytes each
Values:     100 bytes each (50 bytes after compression)
Entries:    1000000
Prefix:    0 bytes
Keys per prefix:    0
RawSize:    110.6 MB (estimated)
FileSize:   62.9 MB (estimated)
Write rate: 0 bytes/second
Compression: Snappy
Memtablerep: skip_list
Perf Level: 1
WARNING: Assertions are enabled; benchmarks unnecessarily slow
------------------------------------------------
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
DB path: [/tmp/rocksdbtest-112628/dbbench]
fillseq      :       4.393 micros/op 227639 ops/sec;   25.2 MB/s

[gzh@dev11575.prn2 ~/roc
Closes https://github.com/facebook/rocksdb/pull/1564

Differential Revision: D4241093

Pulled By: lightmark

fbshipit-source-id: 98c29e3
main
Aaron Gao 8 years ago committed by Facebook Github Bot
parent 989e644ed8
commit 972f96b3fb
  1. 13
      db/c.cc
  2. 10
      db/db_impl.cc
  3. 14
      db/db_test.cc
  4. 16
      db/db_test_util.cc
  5. 2
      db/db_test_util.h
  6. 2
      db/forward_iterator_bench.cc
  7. 9
      examples/rocksdb_option_file_example.ini
  8. 6
      include/rocksdb/c.h
  9. 38
      include/rocksdb/env.h
  10. 32
      include/rocksdb/options.h
  11. 6
      include/rocksdb/table.h
  12. 110
      include/rocksdb/utilities/env_librados.h
  13. 7
      include/rocksdb/utilities/env_mirror.h
  14. 12
      java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java
  15. 79
      java/rocksjni/env_options.cc
  16. 106
      java/rocksjni/options.cc
  17. 31
      java/src/main/java/org/rocksdb/DBOptions.java
  18. 33
      java/src/main/java/org/rocksdb/DBOptionsInterface.java
  19. 29
      java/src/main/java/org/rocksdb/Options.java
  20. 15
      java/src/test/java/org/rocksdb/DBOptionsTest.java
  21. 9
      java/src/test/java/org/rocksdb/EnvOptionsTest.java
  22. 17
      java/src/test/java/org/rocksdb/OptionsTest.java
  23. 26
      port/win/env_win.cc
  24. 8
      port/win/env_win.h
  25. 96
      port/win/io_win.cc
  26. 255
      port/win/io_win.h
  27. 10
      tools/db_bench_tool.cc
  28. 5
      tools/db_bench_tool_test.cc
  29. 2
      util/aligned_buffer.h
  30. 16
      util/db_options.cc
  31. 2
      util/db_options.h
  32. 2
      util/env.cc
  33. 184
      util/env_posix.cc
  34. 34
      util/file_reader_writer.cc
  35. 6
      util/file_reader_writer.h
  36. 10
      util/file_reader_writer_test.cc
  37. 48
      util/io_posix.cc
  38. 28
      util/io_posix.h
  39. 3
      util/log_write_bench.cc
  40. 4
      util/memenv.cc
  41. 4
      util/mock_env.cc
  42. 14
      util/options.cc
  43. 2
      util/options_helper.cc
  44. 6
      util/options_helper.h
  45. 2
      util/options_settable_test.cc
  46. 6
      util/options_test.cc
  47. 3
      util/testutil.cc
  48. 5
      utilities/backupable/backupable_db.cc
  49. 6
      utilities/env_mirror.cc

@ -1719,9 +1719,14 @@ void rocksdb_options_set_manifest_preallocation_size(
void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t* opt, void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t* opt,
unsigned char v) {} unsigned char v) {}
void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt, void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
unsigned char v) { unsigned char v) {
opt->rep.allow_os_buffer = v; opt->rep.use_direct_reads = v;
}
void rocksdb_options_set_use_direct_writes(rocksdb_options_t* opt,
unsigned char v) {
opt->rep.use_direct_writes = v;
} }
void rocksdb_options_set_allow_mmap_reads( void rocksdb_options_set_allow_mmap_reads(
@ -2000,7 +2005,7 @@ rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) { void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) {
if (limiter->rep) { if (limiter->rep) {
delete limiter->rep; delete limiter->rep;
} }
delete limiter; delete limiter;
} }

@ -255,11 +255,17 @@ static Status ValidateOptions(
"More than four DB paths are not supported yet. "); "More than four DB paths are not supported yet. ");
} }
if (db_options.allow_mmap_reads && !db_options.allow_os_buffer) { if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
// Protect against assert in PosixMMapReadableFile constructor // Protect against assert in PosixMMapReadableFile constructor
return Status::NotSupported( return Status::NotSupported(
"If memory mapped reads (allow_mmap_reads) are enabled " "If memory mapped reads (allow_mmap_reads) are enabled "
"then os caching (allow_os_buffer) must also be enabled. "); "then direct I/O reads (use_direct_reads) must be disabled. ");
}
if (db_options.allow_mmap_writes && db_options.use_direct_writes) {
return Status::NotSupported(
"If memory mapped writes (allow_mmap_writes) are enabled "
"then direct I/O writes (use_direct_writes) must be disabled. ");
} }
return Status::OK(); return Status::OK();

@ -3396,19 +3396,21 @@ TEST_F(DBTest, TableOptionsSanitizeTest) {
TEST_F(DBTest, MmapAndBufferOptions) { TEST_F(DBTest, MmapAndBufferOptions) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.allow_os_buffer = false; options.use_direct_reads = true;
options.allow_mmap_reads = true; options.allow_mmap_reads = true;
ASSERT_NOK(TryReopen(options)); ASSERT_NOK(TryReopen(options));
// All other combinations are acceptable // All other combinations are acceptable
options.allow_os_buffer = true; options.use_direct_reads = false;
ASSERT_OK(TryReopen(options)); ASSERT_OK(TryReopen(options));
options.allow_os_buffer = false; if (IsDirectIOSupported()) {
options.allow_mmap_reads = false; options.use_direct_reads = true;
ASSERT_OK(TryReopen(options)); options.allow_mmap_reads = false;
ASSERT_OK(TryReopen(options));
}
options.allow_os_buffer = true; options.use_direct_reads = false;
ASSERT_OK(TryReopen(options)); ASSERT_OK(TryReopen(options));
} }
#endif #endif

@ -478,6 +478,22 @@ Status DBTestBase::TryReopen(const Options& options) {
return DB::Open(options, dbname_, &db_); return DB::Open(options, dbname_, &db_);
} }
bool DBTestBase::IsDirectIOSupported() {
EnvOptions env_options;
env_options.use_mmap_writes = false;
env_options.use_direct_writes = true;
std::string tmp = TempFileName(dbname_, 999);
Status s;
{
unique_ptr<WritableFile> file;
s = env_->NewWritableFile(tmp, &file, env_options);
}
if (s.ok()) {
s = env_->DeleteFile(tmp);
}
return s.ok();
}
Status DBTestBase::Flush(int cf) { Status DBTestBase::Flush(int cf) {
if (cf == 0) { if (cf == 0) {
return db_->Flush(FlushOptions()); return db_->Flush(FlushOptions());

@ -692,6 +692,8 @@ class DBTestBase : public testing::Test {
Status TryReopen(const Options& options); Status TryReopen(const Options& options);
bool IsDirectIOSupported();
Status Flush(int cf = 0); Status Flush(int cf = 0);
Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()); Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());

@ -330,7 +330,7 @@ int main(int argc, char** argv) {
options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone; options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone;
options.level0_slowdown_writes_trigger = 99999; options.level0_slowdown_writes_trigger = 99999;
options.level0_stop_writes_trigger = 99999; options.level0_stop_writes_trigger = 99999;
options.allow_os_buffer = false; options.use_direct_writes = true;
options.write_buffer_size = FLAGS_memtable_size; options.write_buffer_size = FLAGS_memtable_size;
rocksdb::BlockBasedTableOptions table_options; rocksdb::BlockBasedTableOptions table_options;
table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size); table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size);

@ -73,12 +73,13 @@
error_if_exists=false error_if_exists=false
recycle_log_file_num=0 recycle_log_file_num=0
skip_log_error_on_recovery=false skip_log_error_on_recovery=false
allow_mmap_reads=false
allow_os_buffer=true
db_log_dir= db_log_dir=
new_table_reader_for_compaction_inputs=true new_table_reader_for_compaction_inputs=true
allow_mmap_reads=false
allow_mmap_writes=false allow_mmap_writes=false
use_direct_reads=false
use_direct_writes=false
[CFOptions "default"] [CFOptions "default"]
compaction_style=kCompactionStyleLevel compaction_style=kCompactionStyleLevel
@ -127,7 +128,7 @@
write_buffer_size=134217728 write_buffer_size=134217728
disable_auto_compactions=false disable_auto_compactions=false
inplace_update_support=false inplace_update_support=false
[TableOptions/BlockBasedTable "default"] [TableOptions/BlockBasedTable "default"]
format_version=2 format_version=2
whole_key_filtering=true whole_key_filtering=true

@ -657,12 +657,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size(
extern ROCKSDB_LIBRARY_API void extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*, rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*,
unsigned char); unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_os_buffer(
rocksdb_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads(
rocksdb_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_writes(
rocksdb_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery(

@ -61,9 +61,6 @@ struct EnvOptions {
// construct from Options // construct from Options
explicit EnvOptions(const DBOptions& options); explicit EnvOptions(const DBOptions& options);
// If true, then allow caching of data in environment buffers
bool use_os_buffer = true;
// If true, then use mmap to read data // If true, then use mmap to read data
bool use_mmap_reads = false; bool use_mmap_reads = false;
@ -373,8 +370,8 @@ class Env {
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
// of the EnvOptions in the parameters, but is optimized for writing manifest // of the EnvOptions in the parameters, but is optimized for writing manifest
// files. Default implementation returns the copy of the same object. // files. Default implementation returns the copy of the same object.
virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) virtual EnvOptions OptimizeForManifestWrite(
const; const EnvOptions& env_options) const;
// Returns the status of all threads that belong to the current Env. // Returns the status of all threads that belong to the current Env.
virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list) { virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list) {
@ -512,17 +509,15 @@ class WritableFile {
} }
virtual ~WritableFile(); virtual ~WritableFile();
// Indicates if the class makes use of unbuffered I/O // Indicates if the class makes use of direct IO
// If false you must pass aligned buffer to Write() // If true you must pass aligned buffer to Write()
virtual bool UseOSBuffer() const { virtual bool UseDirectIO() const { return false; }
return true;
}
const size_t c_DefaultPageSize = 4 * 1024; const size_t c_DefaultPageSize = 4 * 1024;
// Use the returned alignment value to allocate // Use the returned alignment value to allocate
// aligned buffer for Write() when UseOSBuffer() // aligned buffer for Write() when UseDirectIO()
// returns false // returns true
virtual size_t GetRequiredBufferAlignment() const { virtual size_t GetRequiredBufferAlignment() const {
return c_DefaultPageSize; return c_DefaultPageSize;
} }
@ -538,7 +533,7 @@ class WritableFile {
// the sector. The implementation thus needs to also rewrite the last // the sector. The implementation thus needs to also rewrite the last
// partial sector. // partial sector.
// Note: PositionAppend does not guarantee moving the file offset after the // Note: PositionAppend does not guarantee moving the file offset after the
// write. A WriteabelFile object must support either Append or // write. A WritableFile object must support either Append or
// PositionedAppend, so the users cannot mix the two. // PositionedAppend, so the users cannot mix the two.
// //
// PositionedAppend() can only happen on the page/sector boundaries. For that // PositionedAppend() can only happen on the page/sector boundaries. For that
@ -583,10 +578,6 @@ class WritableFile {
return false; return false;
} }
// Indicates the upper layers if the current WritableFile implementation
// uses direct IO.
virtual bool UseDirectIO() const { return false; }
/* /*
* Change the priority in rate limiter if rate limiting is enabled. * Change the priority in rate limiter if rate limiting is enabled.
* If rate limiting is not enabled, this call has no effect. * If rate limiting is not enabled, this call has no effect.
@ -695,17 +686,14 @@ class RandomRWFile {
RandomRWFile() {} RandomRWFile() {}
virtual ~RandomRWFile() {} virtual ~RandomRWFile() {}
// Indicates if the class makes use of unbuffered I/O // Indicates if the class makes use of direct I/O
// If false you must pass aligned buffer to Write() // If false you must pass aligned buffer to Write()
virtual bool UseOSBuffer() const { virtual bool UseDirectIO() const { return false; }
return true;
}
const size_t c_DefaultPageSize = 4 * 1024; const size_t c_DefaultPageSize = 4 * 1024;
// Use the returned alignment value to allocate // Use the returned alignment value to allocate aligned
// aligned buffer for Write() when UseOSBuffer() // buffer for Write() when UseDirectIO() returns true
// returns false
virtual size_t GetRequiredBufferAlignment() const { virtual size_t GetRequiredBufferAlignment() const {
return c_DefaultPageSize; return c_DefaultPageSize;
} }
@ -722,7 +710,7 @@ class RandomRWFile {
virtual void EnableReadAhead() {} virtual void EnableReadAhead() {}
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
// Pass aligned buffer when UseOSBuffer() returns false. // Pass aligned buffer when UseDirectIO() returns true.
virtual Status Write(uint64_t offset, const Slice& data) = 0; virtual Status Write(uint64_t offset, const Slice& data) = 0;
// Read up to `n` bytes starting from offset `offset` and store them in // Read up to `n` bytes starting from offset `offset` and store them in

@ -1100,26 +1100,6 @@ struct DBOptions {
// large amounts of data (such as xfs's allocsize option). // large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size; size_t manifest_preallocation_size;
// Hint the OS that it should not buffer disk I/O. Enabling this
// parameter may improve performance but increases pressure on the
// system cache.
//
// The exact behavior of this parameter is platform dependent.
//
// On POSIX systems, after RocksDB reads data from disk it will
// mark the pages as "unneeded". The operating system may - or may not
// - evict these pages from memory, reducing pressure on the system
// cache. If the disk block is requested again this can result in
// additional disk I/O.
//
// On WINDOWS system, files will be opened in "unbuffered I/O" mode
// which means that data read from the disk will not be cached or
// bufferized. The hardware buffer of the devices may however still
// be used. Memory mapped files are not impacted by this parameter.
//
// Default: true
bool allow_os_buffer;
// Allow the OS to mmap file for reading sst tables. Default: false // Allow the OS to mmap file for reading sst tables. Default: false
bool allow_mmap_reads; bool allow_mmap_reads;
@ -1128,10 +1108,22 @@ struct DBOptions {
// Default: false // Default: false
bool allow_mmap_writes; bool allow_mmap_writes;
// Enable direct I/O mode for read/write
// they may or may not improve performance depending on the use case
//
// Files will be opened in "direct I/O" mode
// which means that data r/w from the disk will not be cached or
// bufferized. The hardware buffer of the devices may however still
// be used. Memory mapped files are not impacted by these parameters.
// Use O_DIRECT for reading file // Use O_DIRECT for reading file
// Default: false // Default: false
bool use_direct_reads; bool use_direct_reads;
// Use O_DIRECT for writing file
// Default: false
bool use_direct_writes;
// If false, fallocate() calls are bypassed // If false, fallocate() calls are bypassed
bool allow_fallocate; bool allow_fallocate;

@ -157,9 +157,9 @@ struct BlockBasedTableOptions {
// a SstTable. Instead, buffer in WritableFileWriter will take // a SstTable. Instead, buffer in WritableFileWriter will take
// care of the flushing when it is full. // care of the flushing when it is full.
// //
// On Windows, this option helps a lot when unbuffered I/O // This option helps a lot when direct I/O writes
// (allow_os_buffer = false) is used, since it avoids small // (use_direct_writes = true) is used, since it avoids small
// unbuffered disk write. // direct disk write.
// //
// User may also adjust writable_file_max_buffer_size to optimize disk I/O // User may also adjust writable_file_max_buffer_size to optimize disk I/O
// size. // size.

@ -1,5 +1,7 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// vim: ts=8 sw=2 smarttab // This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H #ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H
#define ROCKSDB_UTILITIES_ENV_LIBRADOS_H #define ROCKSDB_UTILITIES_ENV_LIBRADOS_H
@ -15,17 +17,16 @@ namespace rocksdb {
class LibradosWritableFile; class LibradosWritableFile;
class EnvLibrados : public EnvWrapper { class EnvLibrados : public EnvWrapper {
public: public:
// Create a brand new sequentially-readable file with the specified name. // Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK. // On success, stores a pointer to the new file in *result and returns OK.
// On failure stores nullptr in *result and returns non-OK. If the file does // On failure stores nullptr in *result and returns non-OK. If the file does
// not exist, returns a non-OK status. // not exist, returns a non-OK status.
// //
// The returned file will only be accessed by one thread at a time. // The returned file will only be accessed by one thread at a time.
Status NewSequentialFile( Status NewSequentialFile(const std::string& fname,
const std::string& fname, std::unique_ptr<SequentialFile>* result,
std::unique_ptr<SequentialFile>* result, const EnvOptions& options) override;
const EnvOptions& options);
// Create a brand new random access read-only file with the // Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in // specified name. On success, stores a pointer to the new file in
@ -34,10 +35,9 @@ public:
// status. // status.
// //
// The returned file may be concurrently accessed by multiple threads. // The returned file may be concurrently accessed by multiple threads.
Status NewRandomAccessFile( Status NewRandomAccessFile(const std::string& fname,
const std::string& fname, std::unique_ptr<RandomAccessFile>* result,
std::unique_ptr<RandomAccessFile>* result, const EnvOptions& options) override;
const EnvOptions& options);
// Create an object that writes to a new file with the specified // Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a // name. Deletes any existing file with the same name and creates a
@ -46,17 +46,15 @@ public:
// returns non-OK. // returns non-OK.
// //
// The returned file will only be accessed by one thread at a time. // The returned file will only be accessed by one thread at a time.
Status NewWritableFile( Status NewWritableFile(const std::string& fname,
const std::string& fname, std::unique_ptr<WritableFile>* result,
std::unique_ptr<WritableFile>* result, const EnvOptions& options) override;
const EnvOptions& options);
// Reuse an existing file by renaming it and opening it as writable. // Reuse an existing file by renaming it and opening it as writable.
Status ReuseWritableFile( Status ReuseWritableFile(const std::string& fname,
const std::string& fname, const std::string& old_fname,
const std::string& old_fname, std::unique_ptr<WritableFile>* result,
std::unique_ptr<WritableFile>* result, const EnvOptions& options) override;
const EnvOptions& options);
// Create an object that represents a directory. Will fail if directory // Create an object that represents a directory. Will fail if directory
// doesn't exist. If the directory exists, it will open the directory // doesn't exist. If the directory exists, it will open the directory
@ -65,47 +63,44 @@ public:
// On success, stores a pointer to the new Directory in // On success, stores a pointer to the new Directory in
// *result and returns OK. On failure stores nullptr in *result and // *result and returns OK. On failure stores nullptr in *result and
// returns non-OK. // returns non-OK.
Status NewDirectory( Status NewDirectory(const std::string& name,
const std::string& name, std::unique_ptr<Directory>* result) override;
std::unique_ptr<Directory>* result);
// Returns OK if the named file exists. // Returns OK if the named file exists.
// NotFound if the named file does not exist, // NotFound if the named file does not exist,
// the calling process does not have permission to determine // the calling process does not have permission to determine
// whether this file exists, or if the path is invalid. // whether this file exists, or if the path is invalid.
// IOError if an IO Error was encountered // IOError if an IO Error was encountered
Status FileExists(const std::string& fname); Status FileExists(const std::string& fname) overrdie;
// Store in *result the names of the children of the specified directory. // Store in *result the names of the children of the specified directory.
// The names are relative to "dir". // The names are relative to "dir".
// Original contents of *results are dropped. // Original contents of *results are dropped.
Status GetChildren(const std::string& dir, Status GetChildren(const std::string& dir, std::vector<std::string>* result);
std::vector<std::string>* result);
// Delete the named file. // Delete the named file.
Status DeleteFile(const std::string& fname); Status DeleteFile(const std::string& fname) override;
// Create the specified directory. Returns error if directory exists. // Create the specified directory. Returns error if directory exists.
Status CreateDir(const std::string& dirname); Status CreateDir(const std::string& dirname) override;
// Creates directory if missing. Return Ok if it exists, or successful in // Creates directory if missing. Return Ok if it exists, or successful in
// Creating. // Creating.
Status CreateDirIfMissing(const std::string& dirname); Status CreateDirIfMissing(const std::string& dirname) override;
// Delete the specified directory. // Delete the specified directory.
Status DeleteDir(const std::string& dirname); Status DeleteDir(const std::string& dirname) override;
// Store the size of fname in *file_size. // Store the size of fname in *file_size.
Status GetFileSize(const std::string& fname, uint64_t* file_size); Status GetFileSize(const std::string& fname, uint64_t* file_size) override;
// Store the last modification time of fname in *file_mtime. // Store the last modification time of fname in *file_mtime.
Status GetFileModificationTime(const std::string& fname, Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime); uint64_t* file_mtime) override;
// Rename file src to target. // Rename file src to target.
Status RenameFile(const std::string& src, Status RenameFile(const std::string& src, const std::string& target) override;
const std::string& target);
// Hard Link file src to target. // Hard Link file src to target.
Status LinkFile(const std::string& src, const std::string& target); Status LinkFile(const std::string& src, const std::string& target) override;
// Lock the specified file. Used to prevent concurrent access to // Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores nullptr in // the same db by multiple processes. On failure, stores nullptr in
@ -129,8 +124,7 @@ public:
Status UnlockFile(FileLock* lock); Status UnlockFile(FileLock* lock);
// Get full directory name for this db. // Get full directory name for this db.
Status GetAbsolutePath(const std::string& db_path, Status GetAbsolutePath(const std::string& db_path, std::string* output_path);
std::string* output_path);
// Generate unique id // Generate unique id
std::string GenerateUniqueId(); std::string GenerateUniqueId();
@ -142,31 +136,29 @@ public:
const std::string& config_path, const std::string& config_path,
const std::string& db_pool); const std::string& db_pool);
explicit EnvLibrados(const std::string& client_name, // first 3 parameters are for RADOS client init explicit EnvLibrados(
const std::string& cluster_name, const std::string& client_name, // first 3 parameters are
const uint64_t flags, // for RADOS client init
const std::string& db_name, const std::string& cluster_name, const uint64_t flags,
const std::string& config_path, const std::string& db_name, const std::string& config_path,
const std::string& db_pool, const std::string& db_pool, const std::string& wal_dir,
const std::string& wal_dir, const std::string& wal_pool, const uint64_t write_buffer_size);
const std::string& wal_pool, ~EnvLibrados() { _rados.shutdown(); }
const uint64_t write_buffer_size);
~EnvLibrados() { private:
_rados.shutdown();
}
private:
std::string _client_name; std::string _client_name;
std::string _cluster_name; std::string _cluster_name;
uint64_t _flags; uint64_t _flags;
std::string _db_name; // get from user, readable string; Also used as db_id for db metadata std::string _db_name; // get from user, readable string; Also used as db_id
// for db metadata
std::string _config_path; std::string _config_path;
librados::Rados _rados; // RADOS client librados::Rados _rados; // RADOS client
std::string _db_pool_name; std::string _db_pool_name;
librados::IoCtx _db_pool_ioctx; // IoCtx for connecting db_pool librados::IoCtx _db_pool_ioctx; // IoCtx for connecting db_pool
std::string _wal_dir; // WAL dir path std::string _wal_dir; // WAL dir path
std::string _wal_pool_name; std::string _wal_pool_name;
librados::IoCtx _wal_pool_ioctx; // IoCtx for connecting wal_pool librados::IoCtx _wal_pool_ioctx; // IoCtx for connecting wal_pool
uint64_t _write_buffer_size; // WritableFile buffer max size uint64_t _write_buffer_size; // WritableFile buffer max size
/* private function to communicate with rados */ /* private function to communicate with rados */
std::string _CreateFid(); std::string _CreateFid();
@ -175,10 +167,8 @@ private:
Status _RenameFid(const std::string& old_fname, const std::string& new_fname); Status _RenameFid(const std::string& old_fname, const std::string& new_fname);
Status _AddFid(const std::string& fname, const std::string& fid); Status _AddFid(const std::string& fname, const std::string& fid);
Status _DelFid(const std::string& fname); Status _DelFid(const std::string& fname);
Status _GetSubFnames( Status _GetSubFnames(const std::string& dirname,
const std::string& dirname, std::vector<std::string>* result);
std::vector<std::string> * result
);
librados::IoCtx* _GetIoctx(const std::string& prefix); librados::IoCtx* _GetIoctx(const std::string& prefix);
friend class LibradosWritableFile; friend class LibradosWritableFile;
}; };

@ -15,10 +15,9 @@
// semantics and behavior are correct (in that they match that of an // semantics and behavior are correct (in that they match that of an
// existing, stable Env, like the default POSIX one). // existing, stable Env, like the default POSIX one).
#ifndef ROCKSDB_LITE #pragma once
#ifndef STORAGE_ROCKSDB_INCLUDE_UTILITIES_ENVMIRROR_H_ #ifndef ROCKSDB_LITE
#define STORAGE_ROCKSDB_INCLUDE_UTLIITIES_ENVMIRROR_H_
#include <iostream> #include <iostream>
#include <algorithm> #include <algorithm>
@ -174,6 +173,4 @@ class EnvMirror : public EnvWrapper {
} // namespace rocksdb } // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_UTILITIES_ENVMIRROR_H_
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

@ -1417,12 +1417,18 @@ public class DbBenchmark {
} }
}, },
/* TODO(yhchiang): enable the following /* TODO(yhchiang): enable the following
bufferedio(rocksdb::EnvOptions().use_os_buffer, direct_reads(rocksdb::EnvOptions().use_direct_reads,
"Allow buffered io using OS buffers.") { "Allow direct I/O reads.") {
@Override public Object parseValue(String value) { @Override public Object parseValue(String value) {
return parseBoolean(value); return parseBoolean(value);
} }
}, },
direct_writes(rocksdb::EnvOptions().use_direct_reads,
"Allow direct I/O reads.") {
@Override public Object parseValue(String value) {
return parseBoolean(value);
}
},
*/ */
mmap_read(false, mmap_read(false,
"Allow reads to occur via mmap-ing files.") { "Allow reads to occur via mmap-ing files.") {

@ -49,23 +49,43 @@ void Java_org_rocksdb_EnvOptions_disposeInternal(JNIEnv *env, jobject jobj,
/* /*
* Class: org_rocksdb_EnvOptions * Class: org_rocksdb_EnvOptions
* Method: setUseOsBuffer * Method: setUseDirectReads
* Signature: (JZ)V * Signature: (JZ)V
*/ */
void Java_org_rocksdb_EnvOptions_setUseOsBuffer(JNIEnv *env, jobject jobj, void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *env, jobject jobj,
jlong jhandle, jlong jhandle,
jboolean use_os_buffer) { jboolean use_direct_reads) {
ENV_OPTIONS_SET_BOOL(jhandle, use_os_buffer); ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
} }
/* /*
* Class: org_rocksdb_EnvOptions * Class: org_rocksdb_EnvOptions
* Method: useOsBuffer * Method: useDirectReads
* Signature: (J)Z * Signature: (J)Z
*/ */
jboolean Java_org_rocksdb_EnvOptions_useOsBuffer(JNIEnv *env, jobject jobj, jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *env, jobject jobj,
jlong jhandle) { jlong jhandle) {
return ENV_OPTIONS_GET(jhandle, use_os_buffer); return ENV_OPTIONS_GET(jhandle, use_direct_reads);
}
/*
* Class: org_rocksdb_EnvOptions
* Method: setUseDirectWrites
* Signature: (JZ)V
*/
void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
JNIEnv *env, jobject jobj, jlong jhandle, jboolean use_direct_writes) {
ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
}
/*
* Class: org_rocksdb_EnvOptions
* Method: useDirectWrites
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *env, jobject jobj,
jlong jhandle) {
return ENV_OPTIONS_GET(jhandle, use_direct_writes);
} }
/* /*
@ -110,47 +130,6 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *env, jobject jobj,
return ENV_OPTIONS_GET(jhandle, use_mmap_writes); return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
} }
/*
* Class: org_rocksdb_EnvOptions
* Method: setUseDirectReads
* Signature: (JZ)V
*/
void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *env, jobject jobj,
jlong jhandle,
jboolean use_direct_reads) {
ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
}
/*
* Class: org_rocksdb_EnvOptions
* Method: useDirectReads
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *env, jobject jobj,
jlong jhandle) {
return ENV_OPTIONS_GET(jhandle, use_direct_reads);
}
/*
* Class: org_rocksdb_EnvOptions
* Method: setUseDirectWrites
* Signature: (JZ)V
*/
void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
JNIEnv *env, jobject jobj, jlong jhandle, jboolean use_direct_writes) {
ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
}
/*
* Class: org_rocksdb_EnvOptions
* Method: useDirectWrites
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *env, jobject jobj,
jlong jhandle) {
return ENV_OPTIONS_GET(jhandle, use_direct_writes);
}
/* /*
* Class: org_rocksdb_EnvOptions * Class: org_rocksdb_EnvOptions
* Method: setAllowFallocate * Method: setAllowFallocate

@ -870,27 +870,6 @@ void Java_org_rocksdb_Options_setManifestPreallocationSize(
} }
} }
/*
* Class: org_rocksdb_Options
* Method: allowOsBuffer
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_Options_allowOsBuffer(
JNIEnv* env, jobject jobj, jlong jhandle) {
return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer;
}
/*
* Class: org_rocksdb_Options
* Method: setAllowOsBuffer
* Signature: (JZ)V
*/
void Java_org_rocksdb_Options_setAllowOsBuffer(
JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) {
reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer =
static_cast<bool>(allow_os_buffer);
}
/* /*
* Method: setTableFactory * Method: setTableFactory
* Signature: (JJ)V * Signature: (JJ)V
@ -943,6 +922,50 @@ void Java_org_rocksdb_Options_setAllowMmapWrites(
static_cast<bool>(allow_mmap_writes); static_cast<bool>(allow_mmap_writes);
} }
/*
* Class: org_rocksdb_Options
* Method: useDirectReads
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv* env, jobject jobj,
jlong jhandle) {
return reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_reads;
}
/*
* Class: org_rocksdb_Options
* Method: setUseDirectReads
* Signature: (JZ)V
*/
void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv* env, jobject jobj,
jlong jhandle,
jboolean use_direct_reads) {
reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_reads =
static_cast<bool>(use_direct_reads);
}
/*
* Class: org_rocksdb_Options
* Method: useDirectWrites
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_Options_useDirectWrites(JNIEnv* env, jobject jobj,
jlong jhandle) {
return reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_writes;
}
/*
* Class: org_rocksdb_Options
* Method: setUseDirectReads
* Signature: (JZ)V
*/
void Java_org_rocksdb_Options_setUseDirectWrites(JNIEnv* env, jobject jobj,
jlong jhandle,
jboolean use_direct_writes) {
reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_writes =
static_cast<bool>(use_direct_writes);
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: isFdCloseOnExec * Method: isFdCloseOnExec
@ -4144,23 +4167,46 @@ jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(
/* /*
* Class: org_rocksdb_DBOptions * Class: org_rocksdb_DBOptions
* Method: setAllowOsBuffer * Method: useDirectReads
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv* env, jobject jobj,
jlong jhandle) {
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_reads;
}
/*
* Class: org_rocksdb_DBOptions
* Method: setUseDirectReads
* Signature: (JZ)V * Signature: (JZ)V
*/ */
void Java_org_rocksdb_DBOptions_setAllowOsBuffer( void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv* env, jobject jobj,
JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) { jlong jhandle,
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_os_buffer = jboolean use_direct_reads) {
static_cast<bool>(allow_os_buffer); reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_reads =
static_cast<bool>(use_direct_reads);
} }
/* /*
* Class: org_rocksdb_DBOptions * Class: org_rocksdb_DBOptions
* Method: allowOsBuffer * Method: useDirectWrites
* Signature: (J)Z * Signature: (J)Z
*/ */
jboolean Java_org_rocksdb_DBOptions_allowOsBuffer( jboolean Java_org_rocksdb_DBOptions_useDirectWrites(JNIEnv* env, jobject jobj,
JNIEnv* env, jobject jobj, jlong jhandle) { jlong jhandle) {
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_os_buffer; return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_writes;
}
/*
* Class: org_rocksdb_DBOptions
* Method: setUseDirectReads
* Signature: (JZ)V
*/
void Java_org_rocksdb_DBOptions_setUseDirectWrites(JNIEnv* env, jobject jobj,
jlong jhandle,
jboolean use_direct_writes) {
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_writes =
static_cast<bool>(use_direct_writes);
} }
/* /*

@ -457,17 +457,31 @@ public class DBOptions extends RocksObject implements DBOptionsInterface {
} }
@Override @Override
public DBOptions setAllowOsBuffer( public DBOptions setUseDirectReads(
final boolean allowOsBuffer) { final boolean useDirectReads) {
assert(isOwningHandle()); assert(isOwningHandle());
setAllowOsBuffer(nativeHandle_, allowOsBuffer); setUseDirectReads(nativeHandle_, useDirectReads);
return this; return this;
} }
@Override @Override
public boolean allowOsBuffer() { public boolean useDirectReads() {
assert(isOwningHandle()); assert(isOwningHandle());
return allowOsBuffer(nativeHandle_); return useDirectReads(nativeHandle_);
}
@Override
public DBOptions setUseDirectWrites(
final boolean useDirectWrites) {
assert(isOwningHandle());
setUseDirectWrites(nativeHandle_, useDirectWrites);
return this;
}
@Override
public boolean useDirectWrites() {
assert(isOwningHandle());
return useDirectWrites(nativeHandle_);
} }
@Override @Override
@ -710,9 +724,10 @@ public long delayedWriteRate(){
private native void setManifestPreallocationSize( private native void setManifestPreallocationSize(
long handle, long size) throws IllegalArgumentException; long handle, long size) throws IllegalArgumentException;
private native long manifestPreallocationSize(long handle); private native long manifestPreallocationSize(long handle);
private native void setAllowOsBuffer( private native void setUseDirectReads(long handle, boolean useDirectReads);
long handle, boolean allowOsBuffer); private native boolean useDirectReads(long handle);
private native boolean allowOsBuffer(long handle); private native void setUseDirectWrites(long handle, boolean useDirectWrites);
private native boolean useDirectWrites(long handle);
private native void setAllowMmapReads( private native void setAllowMmapReads(
long handle, boolean allowMmapReads); long handle, boolean allowMmapReads);
private native boolean allowMmapReads(long handle); private native boolean allowMmapReads(long handle);

@ -673,21 +673,38 @@ public interface DBOptionsInterface {
long manifestPreallocationSize(); long manifestPreallocationSize();
/** /**
* Data being read from file storage may be buffered in the OS * Enable the OS to use direct I/O for reading sst tables.
* Default: true * Default: false
* *
* @param allowOsBuffer if true, then OS buffering is allowed. * @param useDirectReads if true, then direct read is enabled
* @return the instance of the current Object. * @return the instance of the current Object.
*/ */
Object setAllowOsBuffer(boolean allowOsBuffer); Object setUseDirectReads(boolean useDirectReads);
/** /**
* Data being read from file storage may be buffered in the OS * Enable the OS to use direct I/O for reading sst tables.
* Default: true * Default: false
*
* @return if true, then direct reads are enabled
*/
boolean useDirectReads();
/**
* Enable the OS to use direct I/O for writing sst tables.
* Default: false
*
* @param useDirectWrites if true, then direct write is enabled
* @return the instance of the current Object.
*/
Object setUseDirectWrites(boolean useDirectWrites);
/**
* Enable the OS to use direct I/O for writing sst tables.
* Default: false
* *
* @return if true, then OS buffering is allowed. * @return if true, then direct writes are enabled
*/ */
boolean allowOsBuffer(); boolean useDirectWrites();
/** /**
* Allow the OS to mmap file for reading sst tables. * Allow the OS to mmap file for reading sst tables.

@ -530,18 +530,32 @@ public class Options extends RocksObject
} }
@Override @Override
public boolean allowOsBuffer() { public Options setUseDirectReads(final boolean useDirectReads) {
assert(isOwningHandle()); assert(isOwningHandle());
return allowOsBuffer(nativeHandle_); setUseDirectReads(nativeHandle_, useDirectReads);
return this;
}
@Override
public boolean useDirectReads() {
assert(isOwningHandle());
return useDirectReads(nativeHandle_);
} }
@Override @Override
public Options setAllowOsBuffer(final boolean allowOsBuffer) { public Options setUseDirectWrites(final boolean useDirectWrites) {
assert(isOwningHandle()); assert(isOwningHandle());
setAllowOsBuffer(nativeHandle_, allowOsBuffer); setUseDirectWrites(nativeHandle_, useDirectWrites);
return this; return this;
} }
@Override
public boolean useDirectWrites() {
assert(isOwningHandle());
return useDirectWrites(nativeHandle_);
}
@Override @Override
public boolean allowMmapReads() { public boolean allowMmapReads() {
assert(isOwningHandle()); assert(isOwningHandle());
@ -1289,9 +1303,10 @@ public class Options extends RocksObject
private native void setManifestPreallocationSize( private native void setManifestPreallocationSize(
long handle, long size) throws IllegalArgumentException; long handle, long size) throws IllegalArgumentException;
private native long manifestPreallocationSize(long handle); private native long manifestPreallocationSize(long handle);
private native void setAllowOsBuffer( private native void setUseDirectReads(long handle, boolean useDirectReads);
long handle, boolean allowOsBuffer); private native boolean useDirectReads(long handle);
private native boolean allowOsBuffer(long handle); private native void setUseDirectWrites(long handle, boolean useDirectWrites);
private native boolean useDirectWrites(long handle);
private native void setAllowMmapReads( private native void setAllowMmapReads(
long handle, boolean allowMmapReads); long handle, boolean allowMmapReads);
private native boolean allowMmapReads(long handle); private native boolean allowMmapReads(long handle);

@ -281,11 +281,20 @@ public class DBOptionsTest {
} }
@Test @Test
public void allowOsBuffer() { public void useDirectReads() {
try(final DBOptions opt = new DBOptions()) { try(final DBOptions opt = new DBOptions()) {
final boolean boolValue = rand.nextBoolean(); final boolean boolValue = rand.nextBoolean();
opt.setAllowOsBuffer(boolValue); opt.setUseDirectReads(boolValue);
assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); assertThat(opt.useDirectReads()).isEqualTo(boolValue);
}
}
@Test
public void useDirectWrites() {
try(final DBOptions opt = new DBOptions()) {
final boolean boolValue = rand.nextBoolean();
opt.setUseDirectWrites(boolValue);
assertThat(opt.useDirectWrites()).isEqualTo(boolValue);
} }
} }

@ -18,15 +18,6 @@ public class EnvOptionsTest {
public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory(); public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory();
@Test
public void useOsBuffer() {
try (final EnvOptions envOptions = new EnvOptions()) {
final boolean boolValue = rand.nextBoolean();
envOptions.setUseOsBuffer(boolValue);
assertThat(envOptions.useOsBuffer()).isEqualTo(boolValue);
}
}
@Test @Test
public void useMmapReads() { public void useMmapReads() {
try (final EnvOptions envOptions = new EnvOptions()) { try (final EnvOptions envOptions = new EnvOptions()) {

@ -572,11 +572,20 @@ public class OptionsTest {
} }
@Test @Test
public void allowOsBuffer() { public void useDirectReads() {
try (final Options opt = new Options()) { try(final Options opt = new Options()) {
final boolean boolValue = rand.nextBoolean();
opt.setUseDirectReads(boolValue);
assertThat(opt.useDirectReads()).isEqualTo(boolValue);
}
}
@Test
public void useDirectWrites() {
try(final Options opt = new Options()) {
final boolean boolValue = rand.nextBoolean(); final boolean boolValue = rand.nextBoolean();
opt.setAllowOsBuffer(boolValue); opt.setUseDirectWrites(boolValue);
assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); assertThat(opt.useDirectWrites()).isEqualTo(boolValue);
} }
} }

@ -7,9 +7,10 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "port/win/env_win.h"
#include <algorithm> #include <algorithm>
#include <thread>
#include <ctime> #include <ctime>
#include <thread>
#include <errno.h> #include <errno.h>
#include <process.h> // _getpid #include <process.h> // _getpid
@ -25,7 +26,6 @@
#include "port/dirent.h" #include "port/dirent.h"
#include "port/win/win_logger.h" #include "port/win/win_logger.h"
#include "port/win/io_win.h" #include "port/win/io_win.h"
#include "port/win/env_win.h"
#include "util/iostats_context_imp.h" #include "util/iostats_context_imp.h"
@ -148,7 +148,7 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname,
// Random access is to disable read-ahead as the system reads too much data // Random access is to disable read-ahead as the system reads too much data
DWORD fileFlags = FILE_ATTRIBUTE_READONLY; DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
if (!options.use_os_buffer && !options.use_mmap_reads) { if (options.use_direct_reads && !options.use_mmap_reads) {
fileFlags |= FILE_FLAG_NO_BUFFERING; fileFlags |= FILE_FLAG_NO_BUFFERING;
} else { } else {
fileFlags |= FILE_FLAG_RANDOM_ACCESS; fileFlags |= FILE_FLAG_RANDOM_ACCESS;
@ -229,8 +229,8 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname,
} }
Status WinEnvIO::NewWritableFile(const std::string& fname, Status WinEnvIO::NewWritableFile(const std::string& fname,
std::unique_ptr<WritableFile>* result, std::unique_ptr<WritableFile>* result,
const EnvOptions& options) { const EnvOptions& options) {
const size_t c_BufferCapacity = 64 * 1024; const size_t c_BufferCapacity = 64 * 1024;
EnvOptions local_options(options); EnvOptions local_options(options);
@ -240,7 +240,7 @@ Status WinEnvIO::NewWritableFile(const std::string& fname,
DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
if (!local_options.use_os_buffer && !local_options.use_mmap_writes) { if (local_options.use_direct_writes && !local_options.use_mmap_writes) {
fileFlags = FILE_FLAG_NO_BUFFERING; fileFlags = FILE_FLAG_NO_BUFFERING;
} }
@ -305,7 +305,7 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname,
DWORD creation_disposition = OPEN_ALWAYS; // Create if necessary or open existing DWORD creation_disposition = OPEN_ALWAYS; // Create if necessary or open existing
DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; DWORD file_flags = FILE_FLAG_RANDOM_ACCESS;
if (!options.use_os_buffer) { if (options.use_direct_reads && options.use_direct_writes) {
file_flags |= FILE_FLAG_NO_BUFFERING; file_flags |= FILE_FLAG_NO_BUFFERING;
} }
@ -744,11 +744,11 @@ std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) {
EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options,
const DBOptions& db_options) const { const DBOptions& db_options) const {
EnvOptions optimized = env_options; EnvOptions optimized = env_options;
optimized.use_mmap_writes = false;
optimized.bytes_per_sync = db_options.wal_bytes_per_sync; optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
optimized.use_os_buffer = optimized.use_mmap_writes = false;
true; // This is because we flush only whole pages on unbuffered io and // This is because we flush only whole pages on unbuffered io and
// the last records are not guaranteed to be flushed. // the last records are not guaranteed to be flushed.
optimized.use_direct_writes = false;
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
// test and make this false // test and make this false
@ -760,7 +760,7 @@ EnvOptions WinEnvIO::OptimizeForManifestWrite(
const EnvOptions& env_options) const { const EnvOptions& env_options) const {
EnvOptions optimized = env_options; EnvOptions optimized = env_options;
optimized.use_mmap_writes = false; optimized.use_mmap_writes = false;
optimized.use_os_buffer = true; optimized.use_direct_writes = false;
optimized.fallocate_with_keep_size = true; optimized.fallocate_with_keep_size = true;
return optimized; return optimized;
} }
@ -914,8 +914,8 @@ Status WinEnv::NewRandomAccessFile(const std::string& fname,
} }
Status WinEnv::NewWritableFile(const std::string& fname, Status WinEnv::NewWritableFile(const std::string& fname,
std::unique_ptr<WritableFile>* result, std::unique_ptr<WritableFile>* result,
const EnvOptions& options) { const EnvOptions& options) {
return winenv_io_.NewWritableFile(fname, result, options); return winenv_io_.NewWritableFile(fname, result, options);
} }

@ -89,8 +89,8 @@ public:
const EnvOptions& options); const EnvOptions& options);
virtual Status NewWritableFile(const std::string& fname, virtual Status NewWritableFile(const std::string& fname,
std::unique_ptr<WritableFile>* result, std::unique_ptr<WritableFile>* result,
const EnvOptions& options); const EnvOptions& options);
// The returned file will only be accessed by one thread at a time. // The returned file will only be accessed by one thread at a time.
virtual Status NewRandomRWFile(const std::string& fname, virtual Status NewRandomRWFile(const std::string& fname,
@ -190,8 +190,8 @@ public:
const EnvOptions& options) override; const EnvOptions& options) override;
Status NewWritableFile(const std::string& fname, Status NewWritableFile(const std::string& fname,
std::unique_ptr<WritableFile>* result, std::unique_ptr<WritableFile>* result,
const EnvOptions& options) override; const EnvOptions& options) override;
// The returned file will only be accessed by one thread at a time. // The returned file will only be accessed by one thread at a time.
Status NewRandomRWFile(const std::string& fname, Status NewRandomRWFile(const std::string& fname,

@ -12,7 +12,6 @@
#include "util/sync_point.h" #include "util/sync_point.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/iostats_context_imp.h" #include "util/iostats_context_imp.h"
#include "util/sync_point.h"
#include "util/aligned_buffer.h" #include "util/aligned_buffer.h"
@ -158,12 +157,14 @@ size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// WinMmapReadableFile // WinMmapReadableFile
WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
const void* mapped_region, size_t length) HANDLE hFile, HANDLE hMap,
: WinFileData(fileName, hFile, false), const void* mapped_region,
hMap_(hMap), size_t length)
mapped_region_(mapped_region), : WinFileData(fileName, hFile, false /* use_direct_io */),
length_(length) {} hMap_(hMap),
mapped_region_(mapped_region),
length_(length) {}
WinMmapReadableFile::~WinMmapReadableFile() { WinMmapReadableFile::~WinMmapReadableFile() {
BOOL ret = ::UnmapViewOfFile(mapped_region_); BOOL ret = ::UnmapViewOfFile(mapped_region_);
@ -521,9 +522,8 @@ size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
// WinSequentialFile // WinSequentialFile
WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
const EnvOptions& options) const EnvOptions& options)
: WinFileData(fname, f, options.use_os_buffer) : WinFileData(fname, f, options.use_direct_reads) {}
{}
WinSequentialFile::~WinSequentialFile() { WinSequentialFile::~WinSequentialFile() {
assert(hFile_ != INVALID_HANDLE_VALUE); assert(hFile_ != INVALID_HANDLE_VALUE);
@ -661,8 +661,8 @@ WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
assert(!options.use_mmap_reads); assert(!options.use_mmap_reads);
// Unbuffered access, use internal buffer for reads // Direct access, use internal buffer for reads
if (!file_base_->UseOSBuffer()) { if (file_base_->UseDirectIO()) {
// Do not allocate the buffer either until the first request or // Do not allocate the buffer either until the first request or
// until there is a call to allocate a read-ahead buffer // until there is a call to allocate a read-ahead buffer
buffer_.Alignment(alignment); buffer_.Alignment(alignment);
@ -683,11 +683,10 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
return s; return s;
} }
// When in unbuffered mode we need to do the following changes: // When in direct I/O mode we need to do the following changes:
// - use our own aligned buffer // - use our own aligned buffer
// - always read at the offset of that is a multiple of alignment // - always read at the offset of that is a multiple of alignment
if (!file_base_->UseOSBuffer()) { if (file_base_->UseDirectIO()) {
uint64_t first_page_start = 0; uint64_t first_page_start = 0;
size_t actual_bytes_toread = 0; size_t actual_bytes_toread = 0;
size_t bytes_requested = left; size_t bytes_requested = left;
@ -778,10 +777,8 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
inline inline
void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) { void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) {
if (pattern == RandomAccessFile::SEQUENTIAL && file_base_->UseDirectIO() &&
if (pattern == RandomAccessFile::SEQUENTIAL && compaction_readahead_size_ > 0) {
!file_base_->UseOSBuffer() &&
compaction_readahead_size_ > 0) {
std::lock_guard<std::mutex> lg(buffer_mut_); std::lock_guard<std::mutex> lg(buffer_mut_);
if (!read_ahead_) { if (!read_ahead_) {
read_ahead_ = true; read_ahead_ = true;
@ -798,11 +795,11 @@ void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) {
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
/// WinRandomAccessFile /// WinRandomAccessFile
WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
const EnvOptions& options) : size_t alignment,
WinFileData(fname, hFile, options.use_os_buffer), const EnvOptions& options)
WinRandomAccessImpl(this, alignment, options) { : WinFileData(fname, hFile, options.use_direct_reads),
} WinRandomAccessImpl(this, alignment, options) {}
WinRandomAccessFile::~WinRandomAccessFile() { WinRandomAccessFile::~WinRandomAccessFile() {
} }
@ -851,7 +848,7 @@ WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
Status WinWritableImpl::AppendImpl(const Slice& data) { Status WinWritableImpl::AppendImpl(const Slice& data) {
// Used for buffered access ONLY // Used for buffered access ONLY
assert(file_data_->UseOSBuffer()); assert(!file_data_->UseDirectIO());
assert(data.size() < std::numeric_limits<DWORD>::max()); assert(data.size() < std::numeric_limits<DWORD>::max());
Status s; Status s;
@ -885,7 +882,7 @@ Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset)
} }
else { else {
assert(size_t(ret) == data.size()); assert(size_t(ret) == data.size());
// For sequential write this would be simple // For sequential write this would be simple
// size extension by data.size() // size extension by data.size()
uint64_t write_end = offset + data.size(); uint64_t write_end = offset + data.size();
if (write_end >= filesize_) { if (write_end >= filesize_) {
@ -934,9 +931,8 @@ Status WinWritableImpl::SyncImpl() {
// Calls flush buffers // Calls flush buffers
if (fsync(file_data_->GetFileHandle()) < 0) { if (fsync(file_data_->GetFileHandle()) < 0) {
auto lastError = GetLastError(); auto lastError = GetLastError();
s = IOErrorFromWindowsError("fsync failed at Sync() for: " + s = IOErrorFromWindowsError(
file_data_->GetName(), "fsync failed at Sync() for: " + file_data_->GetName(), lastError);
lastError);
} }
return s; return s;
} }
@ -967,21 +963,19 @@ Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// WinWritableFile /// WinWritableFile
WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
size_t /* capacity */, const EnvOptions& options) size_t alignment, size_t /* capacity */,
: WinFileData(fname, hFile, options.use_os_buffer), const EnvOptions& options)
WinWritableImpl(this, alignment) { : WinFileData(fname, hFile, options.use_direct_writes),
WinWritableImpl(this, alignment) {
assert(!options.use_mmap_writes); assert(!options.use_mmap_writes);
} }
WinWritableFile::~WinWritableFile() { WinWritableFile::~WinWritableFile() {
} }
// Indicates if the class makes use of unbuffered I/O // Indicates if the class makes use of direct I/O
bool WinWritableFile::UseOSBuffer() const { bool WinWritableFile::UseDirectIO() const { return WinFileData::UseDirectIO(); }
return WinFileData::UseOSBuffer();
}
size_t WinWritableFile::GetRequiredBufferAlignment() const { size_t WinWritableFile::GetRequiredBufferAlignment() const {
return GetAlignement(); return GetAlignement();
@ -1015,9 +1009,7 @@ Status WinWritableFile::Sync() {
return SyncImpl(); return SyncImpl();
} }
Status WinWritableFile::Fsync() { Status WinWritableFile::Fsync() { return SyncImpl(); }
return SyncImpl();
}
uint64_t WinWritableFile::GetFileSize() { uint64_t WinWritableFile::GetFileSize() {
return GetFileSizeImpl(); return GetFileSizeImpl();
@ -1034,17 +1026,14 @@ size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
/// WinRandomRWFile /// WinRandomRWFile
WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
const EnvOptions& options) : size_t alignment, const EnvOptions& options)
WinFileData(fname, hFile, options.use_os_buffer), : WinFileData(fname, hFile,
WinRandomAccessImpl(this, alignment, options), options.use_direct_reads && options.use_direct_writes),
WinWritableImpl(this, alignment) { WinRandomAccessImpl(this, alignment, options),
WinWritableImpl(this, alignment) {}
}
bool WinRandomRWFile::UseOSBuffer() const { bool WinRandomRWFile::UseDirectIO() const { return WinFileData::UseDirectIO(); }
return WinFileData::UseOSBuffer();
}
size_t WinRandomRWFile::GetRequiredBufferAlignment() const { size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
return GetAlignement(); return GetAlignement();
@ -1062,8 +1051,8 @@ Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
return PositionedAppendImpl(data, offset); return PositionedAppendImpl(data, offset);
} }
Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice * result, Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
char * scratch) const { char* scratch) const {
return ReadImpl(offset, n, result, scratch); return ReadImpl(offset, n, result, scratch);
} }
@ -1094,4 +1083,3 @@ WinFileLock::~WinFileLock() {
} }
} }

@ -8,17 +8,16 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once #pragma once
#include <rocksdb/Status.h> #include <stdint.h>
#include <rocksdb/env.h> #include <mutex>
#include <string>
#include "rocksdb/Status.h"
#include "rocksdb/env.h"
#include "util/aligned_buffer.h" #include "util/aligned_buffer.h"
#include <string>
#include <stdint.h>
#include <Windows.h> #include <Windows.h>
#include <mutex>
namespace rocksdb { namespace rocksdb {
namespace port { namespace port {
@ -26,9 +25,9 @@ namespace port {
std::string GetWindowsErrSz(DWORD err); std::string GetWindowsErrSz(DWORD err);
inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) ? return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL))
Status::NoSpace(context, GetWindowsErrSz(err)) : ? Status::NoSpace(context, GetWindowsErrSz(err))
Status::IOError(context, GetWindowsErrSz(err)); : Status::IOError(context, GetWindowsErrSz(err));
} }
inline Status IOErrorFromLastWindowsError(const std::string& context) { inline Status IOErrorFromLastWindowsError(const std::string& context) {
@ -36,9 +35,9 @@ inline Status IOErrorFromLastWindowsError(const std::string& context) {
} }
inline Status IOError(const std::string& context, int err_number) { inline Status IOError(const std::string& context, int err_number) {
return (err_number == ENOSPC) ? return (err_number == ENOSPC)
Status::NoSpace(context, strerror(err_number)) : ? Status::NoSpace(context, strerror(err_number))
Status::IOError(context, strerror(err_number)); : Status::IOError(context, strerror(err_number));
} }
// Note the below two do not set errno because they are used only here in this // Note the below two do not set errno because they are used only here in this
@ -54,49 +53,34 @@ inline int fsync(HANDLE hFile) {
return 0; return 0;
} }
SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, uint64_t offset);
uint64_t offset);
SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset); SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset);
Status fallocate(const std::string& filename, HANDLE hFile, Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size);
uint64_t to_size);
Status ftruncate(const std::string& filename, HANDLE hFile,
uint64_t toSize);
Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize);
size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size);
class WinFileData { class WinFileData {
protected: protected:
const std::string filename_; const std::string filename_;
HANDLE hFile_; HANDLE hFile_;
// There is no equivalent of advising away buffered pages as in posix. // If ture, the I/O issued would be direct I/O which the buffer
// To implement this flag we would need to do unbuffered reads which
// will need to be aligned (not sure there is a guarantee that the buffer // will need to be aligned (not sure there is a guarantee that the buffer
// passed in is aligned). // passed in is aligned).
// Hence we currently ignore this flag. It is used only in a few cases const bool use_direct_io_;
// which should not be perf critical.
// If perf evaluation finds this to be a problem, we can look into
// implementing this.
const bool use_os_buffer_;
public:
public:
// We want this class be usable both for inheritance (prive // We want this class be usable both for inheritance (prive
// or protected) and for containment so __ctor and __dtor public // or protected) and for containment so __ctor and __dtor public
WinFileData(const std::string& filename, HANDLE hFile, bool use_os_buffer) : WinFileData(const std::string& filename, HANDLE hFile, bool use_direct_io)
filename_(filename), hFile_(hFile), use_os_buffer_(use_os_buffer) : filename_(filename), hFile_(hFile), use_direct_io_(use_direct_io) {}
{}
virtual ~WinFileData() { virtual ~WinFileData() { this->CloseFile(); }
this->CloseFile();
}
bool CloseFile() { bool CloseFile() {
bool result = true; bool result = true;
if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) { if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
@ -111,13 +95,12 @@ public:
HANDLE GetFileHandle() const { return hFile_; } HANDLE GetFileHandle() const { return hFile_; }
bool UseOSBuffer() const { return use_os_buffer_; } bool UseDirectIO() const { return use_direct_io_; }
WinFileData(const WinFileData&) = delete; WinFileData(const WinFileData&) = delete;
WinFileData& operator=(const WinFileData&) = delete; WinFileData& operator=(const WinFileData&) = delete;
}; };
// mmap() based random-access // mmap() based random-access
class WinMmapReadableFile : private WinFileData, public RandomAccessFile { class WinMmapReadableFile : private WinFileData, public RandomAccessFile {
HANDLE hMap_; HANDLE hMap_;
@ -125,10 +108,10 @@ class WinMmapReadableFile : private WinFileData, public RandomAccessFile {
const void* mapped_region_; const void* mapped_region_;
const size_t length_; const size_t length_;
public: public:
// mapped_region_[0,length-1] contains the mmapped contents of the file. // mapped_region_[0,length-1] contains the mmapped contents of the file.
WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
const void* mapped_region, size_t length); const void* mapped_region, size_t length);
~WinMmapReadableFile(); ~WinMmapReadableFile();
@ -136,7 +119,7 @@ public:
WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete;
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override; char* scratch) const override;
virtual Status InvalidateCache(size_t offset, size_t length) override; virtual Status InvalidateCache(size_t offset, size_t length) override;
@ -148,20 +131,20 @@ public:
// file before reading from it, or for log files, the reading code // file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes. // knows enough to skip zero suffixes.
class WinMmapFile : private WinFileData, public WritableFile { class WinMmapFile : private WinFileData, public WritableFile {
private: private:
HANDLE hMap_; HANDLE hMap_;
const size_t page_size_; // We flush the mapping view in page_size const size_t page_size_; // We flush the mapping view in page_size
// increments. We may decide if this is a memory // increments. We may decide if this is a memory
// page size or SSD page size // page size or SSD page size
const size_t const size_t
allocation_granularity_; // View must start at such a granularity allocation_granularity_; // View must start at such a granularity
size_t reserved_size_; // Preallocated size size_t reserved_size_; // Preallocated size
size_t mapping_size_; // The max size of the mapping object size_t mapping_size_; // The max size of the mapping object
// we want to guess the final file size to minimize the remapping // we want to guess the final file size to minimize the remapping
size_t view_size_; // How much memory to map into a view at a time size_t view_size_; // How much memory to map into a view at a time
char* mapped_begin_; // Must begin at the file offset that is aligned with char* mapped_begin_; // Must begin at the file offset that is aligned with
// allocation_granularity_ // allocation_granularity_
@ -184,10 +167,9 @@ private:
virtual Status PreallocateInternal(uint64_t spaceToReserve); virtual Status PreallocateInternal(uint64_t spaceToReserve);
public: public:
WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
size_t allocation_granularity, const EnvOptions& options); size_t allocation_granularity, const EnvOptions& options);
~WinMmapFile(); ~WinMmapFile();
@ -227,9 +209,9 @@ public:
}; };
class WinSequentialFile : private WinFileData, public SequentialFile { class WinSequentialFile : private WinFileData, public SequentialFile {
public: public:
WinSequentialFile(const std::string& fname, HANDLE f, WinSequentialFile(const std::string& fname, HANDLE f,
const EnvOptions& options); const EnvOptions& options);
~WinSequentialFile(); ~WinSequentialFile();
@ -244,89 +226,87 @@ public:
}; };
class WinRandomAccessImpl { class WinRandomAccessImpl {
protected: protected:
WinFileData* file_base_; WinFileData* file_base_;
bool read_ahead_; bool read_ahead_;
const size_t compaction_readahead_size_; const size_t compaction_readahead_size_;
const size_t random_access_max_buffer_size_; const size_t random_access_max_buffer_size_;
mutable std::mutex buffer_mut_; mutable std::mutex buffer_mut_;
mutable AlignedBuffer buffer_; mutable AlignedBuffer buffer_;
mutable uint64_t mutable uint64_t
buffered_start_; // file offset set that is currently buffered buffered_start_; // file offset set that is currently buffered
// Override for behavior change when creating a custom env // Override for behavior change when creating a custom env
virtual SSIZE_T PositionedReadInternal(char* src, size_t numBytes, virtual SSIZE_T PositionedReadInternal(char* src, size_t numBytes,
uint64_t offset) const; uint64_t offset) const;
/* /*
* The function reads a requested amount of bytes into the specified aligned * The function reads a requested amount of bytes into the specified aligned
* buffer Upon success the function sets the length of the buffer to the * buffer Upon success the function sets the length of the buffer to the
* amount of bytes actually read even though it might be less than actually * amount of bytes actually read even though it might be less than actually
* requested. It then copies the amount of bytes requested by the user (left) * requested. It then copies the amount of bytes requested by the user (left)
* to the user supplied buffer (dest) and reduces left by the amount of bytes * to the user supplied buffer (dest) and reduces left by the amount of bytes
* copied to the user buffer * copied to the user buffer
* *
* @user_offset [in] - offset on disk where the read was requested by the user * @user_offset [in] - offset on disk where the read was requested by the user
* @first_page_start [in] - actual page aligned disk offset that we want to * @first_page_start [in] - actual page aligned disk offset that we want to
* read from * read from
* @bytes_to_read [in] - total amount of bytes that will be read from disk * @bytes_to_read [in] - total amount of bytes that will be read from disk
* which is generally greater or equal to the amount * which is generally greater or equal to the amount
* that the user has requested due to the * that the user has requested due to the
* either alignment requirements or read_ahead in * either alignment requirements or read_ahead in
* effect. * effect.
* @left [in/out] total amount of bytes that needs to be copied to the user * @left [in/out] total amount of bytes that needs to be copied to the user
* buffer. It is reduced by the amount of bytes that actually * buffer. It is reduced by the amount of bytes that actually
* copied * copied
* @buffer - buffer to use * @buffer - buffer to use
* @dest - user supplied buffer * @dest - user supplied buffer
*/ */
SSIZE_T ReadIntoBuffer(uint64_t user_offset, uint64_t first_page_start, SSIZE_T ReadIntoBuffer(uint64_t user_offset, uint64_t first_page_start,
size_t bytes_to_read, size_t& left, size_t bytes_to_read, size_t& left,
AlignedBuffer& buffer, char* dest) const; AlignedBuffer& buffer, char* dest) const;
SSIZE_T ReadIntoOneShotBuffer(uint64_t user_offset, uint64_t first_page_start, SSIZE_T ReadIntoOneShotBuffer(uint64_t user_offset, uint64_t first_page_start,
size_t bytes_to_read, size_t& left, size_t bytes_to_read, size_t& left,
char* dest) const; char* dest) const;
SSIZE_T ReadIntoInstanceBuffer(uint64_t user_offset, SSIZE_T ReadIntoInstanceBuffer(uint64_t user_offset,
uint64_t first_page_start, uint64_t first_page_start,
size_t bytes_to_read, size_t& left, size_t bytes_to_read, size_t& left,
char* dest) const; char* dest) const;
WinRandomAccessImpl(WinFileData* file_base, size_t alignment, WinRandomAccessImpl(WinFileData* file_base, size_t alignment,
const EnvOptions& options); const EnvOptions& options);
virtual ~WinRandomAccessImpl() {} virtual ~WinRandomAccessImpl() {}
public: public:
WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; WinRandomAccessImpl(const WinRandomAccessImpl&) = delete;
WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete;
Status ReadImpl(uint64_t offset, size_t n, Slice* result, Status ReadImpl(uint64_t offset, size_t n, Slice* result,
char* scratch) const; char* scratch) const;
void HintImpl(RandomAccessFile::AccessPattern pattern); void HintImpl(RandomAccessFile::AccessPattern pattern);
}; };
// pread() based random-access // pread() based random-access
class WinRandomAccessFile : private WinFileData, class WinRandomAccessFile
protected WinRandomAccessImpl, // Want to be able to override PositionedReadInternal : private WinFileData,
public RandomAccessFile { protected WinRandomAccessImpl, // Want to be able to override
// PositionedReadInternal
public: public RandomAccessFile {
public:
WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
const EnvOptions& options); const EnvOptions& options);
~WinRandomAccessFile(); ~WinRandomAccessFile();
virtual void EnableReadAhead() override; virtual void EnableReadAhead() override;
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override; char* scratch) const override;
virtual bool ShouldForwardRawRequest() const override; virtual bool ShouldForwardRawRequest() const override;
@ -337,7 +317,6 @@ public:
virtual size_t GetUniqueId(char* id, size_t max_size) const override; virtual size_t GetUniqueId(char* id, size_t max_size) const override;
}; };
// This is a sequential write class. It has been mimicked (as others) after // This is a sequential write class. It has been mimicked (as others) after
// the original Posix class. We add support for unbuffered I/O on windows as // the original Posix class. We add support for unbuffered I/O on windows as
// well // well
@ -351,12 +330,11 @@ public:
// No padding is required for // No padding is required for
// buffered access. // buffered access.
class WinWritableImpl { class WinWritableImpl {
protected: protected:
WinFileData* file_data_;
WinFileData* file_data_; const uint64_t alignment_;
const uint64_t alignment_; uint64_t filesize_; // How much data is actually written disk
uint64_t filesize_; // How much data is actually written disk uint64_t reservedsize_; // how far we have reserved space
uint64_t reservedsize_; // how far we have reserved space
virtual Status PreallocateInternal(uint64_t spaceToReserve); virtual Status PreallocateInternal(uint64_t spaceToReserve);
@ -368,7 +346,8 @@ protected:
Status AppendImpl(const Slice& data); Status AppendImpl(const Slice& data);
// Requires that the data is aligned as specified by GetRequiredBufferAlignment() // Requires that the data is aligned as specified by
// GetRequiredBufferAlignment()
Status PositionedAppendImpl(const Slice& data, uint64_t offset); Status PositionedAppendImpl(const Slice& data, uint64_t offset);
Status TruncateImpl(uint64_t size); Status TruncateImpl(uint64_t size);
@ -380,7 +359,8 @@ protected:
uint64_t GetFileSizeImpl() { uint64_t GetFileSizeImpl() {
// Double accounting now here with WritableFileWriter // Double accounting now here with WritableFileWriter
// and this size will be wrong when unbuffered access is used // and this size will be wrong when unbuffered access is used
// but tests implement their own writable files and do not use WritableFileWrapper // but tests implement their own writable files and do not use
// WritableFileWrapper
// so we need to squeeze a square peg through // so we need to squeeze a square peg through
// a round hole here. // a round hole here.
return filesize_; return filesize_;
@ -388,32 +368,30 @@ protected:
Status AllocateImpl(uint64_t offset, uint64_t len); Status AllocateImpl(uint64_t offset, uint64_t len);
public: public:
WinWritableImpl(const WinWritableImpl&) = delete; WinWritableImpl(const WinWritableImpl&) = delete;
WinWritableImpl& operator=(const WinWritableImpl&) = delete; WinWritableImpl& operator=(const WinWritableImpl&) = delete;
}; };
class WinWritableFile : private WinFileData, class WinWritableFile : private WinFileData,
protected WinWritableImpl, protected WinWritableImpl,
public WritableFile { public WritableFile {
public:
public:
WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
size_t capacity, const EnvOptions& options); size_t capacity, const EnvOptions& options);
~WinWritableFile(); ~WinWritableFile();
// Indicates if the class makes use of unbuffered I/O // Indicates if the class makes use of direct I/O
// Use PositionedAppend // Use PositionedAppend
virtual bool UseOSBuffer() const override; virtual bool UseDirectIO() const override;
virtual size_t GetRequiredBufferAlignment() const override; virtual size_t GetRequiredBufferAlignment() const override;
virtual Status Append(const Slice& data) override; virtual Status Append(const Slice& data) override;
// Requires that the data is aligned as specified by GetRequiredBufferAlignment() // Requires that the data is aligned as specified by
// GetRequiredBufferAlignment()
virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; virtual Status PositionedAppend(const Slice& data, uint64_t offset) override;
// Need to implement this so the file is truncated correctly // Need to implement this so the file is truncated correctly
@ -437,30 +415,27 @@ public:
virtual size_t GetUniqueId(char* id, size_t max_size) const override; virtual size_t GetUniqueId(char* id, size_t max_size) const override;
}; };
class WinRandomRWFile : private WinFileData, class WinRandomRWFile : private WinFileData,
protected WinRandomAccessImpl, protected WinRandomAccessImpl,
protected WinWritableImpl, protected WinWritableImpl,
public RandomRWFile { public RandomRWFile {
public:
public:
WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment,
const EnvOptions& options); const EnvOptions& options);
~WinRandomRWFile() {} ~WinRandomRWFile() {}
// Indicates if the class makes use of unbuffered I/O // Indicates if the class makes use of direct I/O
// If false you must pass aligned buffer to Write() // If false you must pass aligned buffer to Write()
virtual bool UseOSBuffer() const override; virtual bool UseDirectIO() const override;
// Use the returned alignment value to allocate // Use the returned alignment value to allocate aligned
// aligned buffer for Write() when UseOSBuffer() // buffer for Write() when UseDirectIO() returns true
// returns false
virtual size_t GetRequiredBufferAlignment() const override; virtual size_t GetRequiredBufferAlignment() const override;
// Used by the file_reader_writer to decide if the ReadAhead wrapper // Used by the file_reader_writer to decide if the ReadAhead wrapper
// should simply forward the call and do not enact read_ahead buffering or locking. // should simply forward the call and do not enact read_ahead buffering or
// locking.
// The implementation below takes care of reading ahead // The implementation below takes care of reading ahead
virtual bool ShouldForwardRawRequest() const override; virtual bool ShouldForwardRawRequest() const override;
@ -469,14 +444,14 @@ public:
virtual void EnableReadAhead() override; virtual void EnableReadAhead() override;
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
// Pass aligned buffer when UseOSBuffer() returns false. // Pass aligned buffer when UseDirectIO() returns true.
virtual Status Write(uint64_t offset, const Slice& data) override; virtual Status Write(uint64_t offset, const Slice& data) override;
// Read up to `n` bytes starting from offset `offset` and store them in // Read up to `n` bytes starting from offset `offset` and store them in
// result, provided `scratch` size should be at least `n`. // result, provided `scratch` size should be at least `n`.
// Returns Status::OK() on success. // Returns Status::OK() on success.
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override; char* scratch) const override;
virtual Status Flush() override; virtual Status Flush() override;
@ -487,16 +462,15 @@ public:
virtual Status Close() override; virtual Status Close() override;
}; };
class WinDirectory : public Directory { class WinDirectory : public Directory {
public: public:
WinDirectory() {} WinDirectory() {}
virtual Status Fsync() override; virtual Status Fsync() override;
}; };
class WinFileLock : public FileLock { class WinFileLock : public FileLock {
public: public:
explicit WinFileLock(HANDLE hFile) : hFile_(hFile) { explicit WinFileLock(HANDLE hFile) : hFile_(hFile) {
assert(hFile != NULL); assert(hFile != NULL);
assert(hFile != INVALID_HANDLE_VALUE); assert(hFile != INVALID_HANDLE_VALUE);
@ -504,9 +478,8 @@ public:
~WinFileLock(); ~WinFileLock();
private: private:
HANDLE hFile_; HANDLE hFile_;
}; };
} }
} }

@ -771,9 +771,6 @@ DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
" in MB."); " in MB.");
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size"); DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer,
"Allow buffered io using OS buffers");
DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads, DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
"Allow reads to occur via mmap-ing files"); "Allow reads to occur via mmap-ing files");
@ -783,6 +780,9 @@ DEFINE_bool(mmap_write, rocksdb::EnvOptions().use_mmap_writes,
DEFINE_bool(use_direct_reads, rocksdb::EnvOptions().use_direct_reads, DEFINE_bool(use_direct_reads, rocksdb::EnvOptions().use_direct_reads,
"Use O_DIRECT for reading data"); "Use O_DIRECT for reading data");
DEFINE_bool(use_direct_writes, rocksdb::EnvOptions().use_direct_writes,
"Use O_DIRECT for writing data");
DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open, DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
"Advise random access on table file open"); "Advise random access on table file open");
@ -2750,6 +2750,7 @@ class Benchmark {
options.allow_mmap_reads = FLAGS_mmap_read; options.allow_mmap_reads = FLAGS_mmap_read;
options.allow_mmap_writes = FLAGS_mmap_write; options.allow_mmap_writes = FLAGS_mmap_write;
options.use_direct_reads = FLAGS_use_direct_reads; options.use_direct_reads = FLAGS_use_direct_reads;
options.use_direct_writes = FLAGS_use_direct_writes;
if (FLAGS_prefix_size != 0) { if (FLAGS_prefix_size != 0) {
options.prefix_extractor.reset( options.prefix_extractor.reset(
NewFixedPrefixTransform(FLAGS_prefix_size)); NewFixedPrefixTransform(FLAGS_prefix_size));
@ -2951,9 +2952,6 @@ class Benchmark {
options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits; options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
// fill storage options // fill storage options
options.allow_os_buffer = FLAGS_bufferedio;
options.allow_mmap_reads = FLAGS_mmap_read;
options.allow_mmap_writes = FLAGS_mmap_write;
options.advise_random_on_open = FLAGS_advise_random_on_open; options.advise_random_on_open = FLAGS_advise_random_on_open;
options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e; options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;

@ -208,13 +208,14 @@ const std::string options_file_content = R"OPTIONS_FILE(
max_background_flushes=1 max_background_flushes=1
create_if_missing=true create_if_missing=true
error_if_exists=false error_if_exists=false
allow_os_buffer=true
delayed_write_rate=1048576 delayed_write_rate=1048576
manifest_preallocation_size=4194304 manifest_preallocation_size=4194304
allow_mmap_reads=false
allow_mmap_writes=false allow_mmap_writes=false
use_direct_reads=false
use_direct_writes=false
stats_dump_period_sec=600 stats_dump_period_sec=600
allow_fallocate=true allow_fallocate=true
allow_mmap_reads=false
max_log_file_size=83886080 max_log_file_size=83886080
random_access_max_buffer_size=1048576 random_access_max_buffer_size=1048576
advise_random_on_open=true advise_random_on_open=true

@ -24,7 +24,7 @@ inline size_t Roundup(size_t x, size_t y) {
} }
// This class is to manage an aligned user // This class is to manage an aligned user
// allocated buffer for unbuffered I/O purposes // allocated buffer for direct I/O purposes
// though can be used for any purpose. // though can be used for any purpose.
class AlignedBuffer { class AlignedBuffer {
size_t alignment_; size_t alignment_;

@ -50,10 +50,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
wal_ttl_seconds(options.WAL_ttl_seconds), wal_ttl_seconds(options.WAL_ttl_seconds),
wal_size_limit_mb(options.WAL_size_limit_MB), wal_size_limit_mb(options.WAL_size_limit_MB),
manifest_preallocation_size(options.manifest_preallocation_size), manifest_preallocation_size(options.manifest_preallocation_size),
allow_os_buffer(options.allow_os_buffer),
allow_mmap_reads(options.allow_mmap_reads), allow_mmap_reads(options.allow_mmap_reads),
allow_mmap_writes(options.allow_mmap_writes), allow_mmap_writes(options.allow_mmap_writes),
use_direct_reads(options.use_direct_reads), use_direct_reads(options.use_direct_reads),
use_direct_writes(options.use_direct_writes),
allow_fallocate(options.allow_fallocate), allow_fallocate(options.allow_fallocate),
is_fd_close_on_exec(options.is_fd_close_on_exec), is_fd_close_on_exec(options.is_fd_close_on_exec),
stats_dump_period_sec(options.stats_dump_period_sec), stats_dump_period_sec(options.stats_dump_period_sec),
@ -119,16 +119,16 @@ void ImmutableDBOptions::Dump(Logger* log) const {
Header(log, Header(log,
" Options.recycle_log_file_num: %" ROCKSDB_PRIszt, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
recycle_log_file_num); recycle_log_file_num);
Header(log, " Options.allow_os_buffer: %d",
allow_os_buffer);
Header(log, " Options.allow_mmap_reads: %d",
allow_mmap_reads);
Header(log, " Options.allow_fallocate: %d", Header(log, " Options.allow_fallocate: %d",
allow_fallocate); allow_fallocate);
Header(log, " Options.allow_mmap_reads: %d",
allow_mmap_reads);
Header(log, " Options.allow_mmap_writes: %d", Header(log, " Options.allow_mmap_writes: %d",
allow_mmap_writes); allow_mmap_writes);
Header(log, " Options.use_direct_reads: %d", Header(log, " Options.use_direct_reads: %d",
use_direct_reads); use_direct_reads);
Header(log, " Options.use_direct_writes: %d",
use_direct_writes);
Header(log, " Options.create_missing_column_families: %d", Header(log, " Options.create_missing_column_families: %d",
create_missing_column_families); create_missing_column_families);
Header(log, " Options.db_log_dir: %s", Header(log, " Options.db_log_dir: %s",
@ -148,12 +148,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
Header(log, Header(log,
" Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
manifest_preallocation_size); manifest_preallocation_size);
Header(log, " Options.allow_os_buffer: %d",
allow_os_buffer);
Header(log, " Options.allow_mmap_reads: %d",
allow_mmap_reads);
Header(log, " Options.allow_mmap_writes: %d",
allow_mmap_writes);
Header(log, " Options.is_fd_close_on_exec: %d", Header(log, " Options.is_fd_close_on_exec: %d",
is_fd_close_on_exec); is_fd_close_on_exec);
Header(log, " Options.stats_dump_period_sec: %u", Header(log, " Options.stats_dump_period_sec: %u",

@ -46,10 +46,10 @@ struct ImmutableDBOptions {
uint64_t wal_ttl_seconds; uint64_t wal_ttl_seconds;
uint64_t wal_size_limit_mb; uint64_t wal_size_limit_mb;
size_t manifest_preallocation_size; size_t manifest_preallocation_size;
bool allow_os_buffer;
bool allow_mmap_reads; bool allow_mmap_reads;
bool allow_mmap_writes; bool allow_mmap_writes;
bool use_direct_reads; bool use_direct_reads;
bool use_direct_writes;
bool allow_fallocate; bool allow_fallocate;
bool is_fd_close_on_exec; bool is_fd_close_on_exec;
unsigned int stats_dump_period_sec; unsigned int stats_dump_period_sec;

@ -313,10 +313,10 @@ EnvWrapper::~EnvWrapper() {
namespace { // anonymous namespace namespace { // anonymous namespace
void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) { void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
env_options->use_os_buffer = options.allow_os_buffer;
env_options->use_mmap_reads = options.allow_mmap_reads; env_options->use_mmap_reads = options.allow_mmap_reads;
env_options->use_mmap_writes = options.allow_mmap_writes; env_options->use_mmap_writes = options.allow_mmap_writes;
env_options->use_direct_reads = options.use_direct_reads; env_options->use_direct_reads = options.use_direct_reads;
env_options->use_direct_writes = options.use_direct_writes;
env_options->set_fd_cloexec = options.is_fd_close_on_exec; env_options->set_fd_cloexec = options.is_fd_close_on_exec;
env_options->bytes_per_sync = options.bytes_per_sync; env_options->bytes_per_sync = options.bytes_per_sync;
env_options->compaction_readahead_size = options.compaction_readahead_size; env_options->compaction_readahead_size = options.compaction_readahead_size;

@ -38,6 +38,7 @@
#endif #endif
#include <deque> #include <deque>
#include <set> #include <set>
#include <vector>
#include "port/port.h" #include "port/port.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "util/coding.h" #include "util/coding.h"
@ -250,63 +251,65 @@ class PosixEnv : public Env {
result->reset(); result->reset();
Status s; Status s;
int fd = -1; int fd = -1;
int flags = O_CREAT | O_TRUNC;
// Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
if (options.use_direct_writes && !options.use_mmap_writes) {
// Note: we should avoid O_APPEND here due to ta the following bug:
// POSIX requires that opening a file with the O_APPEND flag should
// have no affect on the location at which pwrite() writes data.
// However, on Linux, if a file is opened with O_APPEND, pwrite()
// appends data to the end of the file, regardless of the value of
// offset.
// More info here: https://linux.die.net/man/2/pwrite
flags |= O_WRONLY;
#ifndef OS_MACOSX
flags |= O_DIRECT;
#endif
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
} else if (options.use_mmap_writes) {
// non-direct I/O
flags |= O_RDWR;
} else {
flags |= O_WRONLY;
}
do { do {
IOSTATS_TIMER_GUARD(open_nanos); IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); fd = open(fname.c_str(), flags, 0644);
} while (fd < 0 && errno == EINTR); } while (fd < 0 && errno == EINTR);
if (fd < 0) { if (fd < 0) {
s = IOError(fname, errno); s = IOError(fname, errno);
} else { return s;
SetFD_CLOEXEC(fd, &options); }
if (options.use_mmap_writes) { SetFD_CLOEXEC(fd, &options);
if (!checkedDiskForMmap_) {
// this will be executed once in the program's lifetime. if (options.use_mmap_writes) {
// do not use mmapWrite on non ext-3/xfs/tmpfs systems. if (!checkedDiskForMmap_) {
if (!SupportsFastAllocate(fname)) { // this will be executed once in the program's lifetime.
forceMmapOff = true; // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
} if (!SupportsFastAllocate(fname)) {
checkedDiskForMmap_ = true; forceMmapOff_ = true;
} }
checkedDiskForMmap_ = true;
} }
if (options.use_mmap_writes && !forceMmapOff) { }
result->reset(new PosixMmapFile(fname, fd, page_size_, options)); if (options.use_mmap_writes && !forceMmapOff_) {
} else if (options.use_direct_writes) { result->reset(new PosixMmapFile(fname, fd, page_size_, options));
close(fd); } else if (options.use_direct_writes && !options.use_mmap_writes) {
#ifdef OS_MACOSX
int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT;
#else
// Note: we should avoid O_APPEND here due to ta the following bug:
// POSIX requires that opening a file with the O_APPEND flag should
// have no affect on the location at which pwrite() writes data.
// However, on Linux, if a file is opened with O_APPEND, pwrite()
// appends data to the end of the file, regardless of the value of
// offset.
// More info here: https://linux.die.net/man/2/pwrite
int flags = O_WRONLY | O_TRUNC | O_CREAT | O_DIRECT;
#endif
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
s = IOError(fname, errno);
} else {
std::unique_ptr<PosixDirectIOWritableFile> file(
new PosixDirectIOWritableFile(fname, fd));
*result = std::move(file);
s = Status::OK();
#ifdef OS_MACOSX #ifdef OS_MACOSX
if (fcntl(fd, F_NOCACHE, 1) == -1) { if (fcntl(fd, F_NOCACHE, 1) == -1) {
close(fd); close(fd);
s = IOError(fname, errno); s = IOError(fname, errno);
} return s;
#endif
}
} else {
// disable mmap writes
EnvOptions no_mmap_writes_options = options;
no_mmap_writes_options.use_mmap_writes = false;
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
} }
#endif
result->reset(new PosixWritableFile(fname, fd, options));
} else {
// disable mmap writes
EnvOptions no_mmap_writes_options = options;
no_mmap_writes_options.use_mmap_writes = false;
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
} }
return s; return s;
} }
@ -318,41 +321,69 @@ class PosixEnv : public Env {
result->reset(); result->reset();
Status s; Status s;
int fd = -1; int fd = -1;
int flags = 0;
// Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
if (options.use_direct_writes && !options.use_mmap_writes) {
flags |= O_WRONLY;
#ifndef OS_MACOSX
flags |= O_DIRECT;
#endif
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
} else if (options.use_mmap_writes) {
// mmap needs O_RDWR mode
flags |= O_RDWR;
} else {
flags |= O_WRONLY;
}
do { do {
IOSTATS_TIMER_GUARD(open_nanos); IOSTATS_TIMER_GUARD(open_nanos);
fd = open(old_fname.c_str(), O_RDWR, 0644); fd = open(old_fname.c_str(), flags, 0644);
} while (fd < 0 && errno == EINTR); } while (fd < 0 && errno == EINTR);
if (fd < 0) { if (fd < 0) {
s = IOError(fname, errno); s = IOError(fname, errno);
} else { return s;
SetFD_CLOEXEC(fd, &options); }
// rename into place
if (rename(old_fname.c_str(), fname.c_str()) != 0) { SetFD_CLOEXEC(fd, &options);
Status r = IOError(old_fname, errno); // rename into place
close(fd); if (rename(old_fname.c_str(), fname.c_str()) != 0) {
return r; s = IOError(old_fname, errno);
} close(fd);
if (options.use_mmap_writes) { return s;
if (!checkedDiskForMmap_) { }
// this will be executed once in the program's lifetime.
// do not use mmapWrite on non ext-3/xfs/tmpfs systems. if (options.use_mmap_writes) {
if (!SupportsFastAllocate(fname)) { if (!checkedDiskForMmap_) {
forceMmapOff = true; // this will be executed once in the program's lifetime.
} // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
checkedDiskForMmap_ = true; if (!SupportsFastAllocate(fname)) {
forceMmapOff_ = true;
} }
checkedDiskForMmap_ = true;
} }
if (options.use_mmap_writes && !forceMmapOff) { }
result->reset(new PosixMmapFile(fname, fd, page_size_, options)); if (options.use_mmap_writes && !forceMmapOff_) {
} else { result->reset(new PosixMmapFile(fname, fd, page_size_, options));
// disable mmap writes } else if (options.use_direct_writes && !options.use_mmap_writes) {
EnvOptions no_mmap_writes_options = options; #ifdef OS_MACOSX
no_mmap_writes_options.use_mmap_writes = false; if (fcntl(fd, F_NOCACHE, 1) == -1) {
close(fd);
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); s = IOError(fname, errno);
return s;
} }
#endif
result->reset(new PosixWritableFile(fname, fd, options));
} else {
// disable mmap writes
EnvOptions no_mmap_writes_options = options;
no_mmap_writes_options.use_mmap_writes = false;
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
} }
return s; return s;
return s;
} }
virtual Status NewRandomRWFile(const std::string& fname, virtual Status NewRandomRWFile(const std::string& fname,
@ -724,6 +755,7 @@ class PosixEnv : public Env {
const DBOptions& db_options) const override { const DBOptions& db_options) const override {
EnvOptions optimized = env_options; EnvOptions optimized = env_options;
optimized.use_mmap_writes = false; optimized.use_mmap_writes = false;
optimized.use_direct_writes = false;
optimized.bytes_per_sync = db_options.wal_bytes_per_sync; optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
@ -736,14 +768,14 @@ class PosixEnv : public Env {
const EnvOptions& env_options) const override { const EnvOptions& env_options) const override {
EnvOptions optimized = env_options; EnvOptions optimized = env_options;
optimized.use_mmap_writes = false; optimized.use_mmap_writes = false;
optimized.use_direct_writes = false;
optimized.fallocate_with_keep_size = true; optimized.fallocate_with_keep_size = true;
return optimized; return optimized;
} }
private: private:
bool checkedDiskForMmap_; bool checkedDiskForMmap_;
bool forceMmapOff; // do we override Env options? bool forceMmapOff_; // do we override Env options?
// Returns true iff the named directory exists and is a directory. // Returns true iff the named directory exists and is a directory.
virtual bool DirExists(const std::string& dname) { virtual bool DirExists(const std::string& dname) {
@ -784,7 +816,7 @@ class PosixEnv : public Env {
PosixEnv::PosixEnv() PosixEnv::PosixEnv()
: checkedDiskForMmap_(false), : checkedDiskForMmap_(false),
forceMmapOff(false), forceMmapOff_(false),
page_size_(getpagesize()), page_size_(getpagesize()),
thread_pools_(Priority::TOTAL) { thread_pools_(Priority::TOTAL) {
ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));

@ -61,9 +61,8 @@ Status WritableFileWriter::Append(const Slice& data) {
writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left); writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
} }
// Flush only when I/O is buffered // Flush only when buffered I/O
if (use_os_buffer_ && if (!direct_io_ && (buf_.Capacity() - buf_.CurrentSize()) < left) {
(buf_.Capacity() - buf_.CurrentSize()) < left) {
if (buf_.CurrentSize() > 0) { if (buf_.CurrentSize() > 0) {
s = Flush(); s = Flush();
if (!s.ok()) { if (!s.ok()) {
@ -79,10 +78,10 @@ Status WritableFileWriter::Append(const Slice& data) {
assert(buf_.CurrentSize() == 0); assert(buf_.CurrentSize() == 0);
} }
// We never write directly to disk with unbuffered I/O on. // We never write directly to disk with direct I/O on.
// or we simply use it for its original purpose to accumulate many small // or we simply use it for its original purpose to accumulate many small
// chunks // chunks
if (!use_os_buffer_ || (buf_.Capacity() >= left)) { if (direct_io_ || (buf_.Capacity() >= left)) {
while (left > 0) { while (left > 0) {
size_t appended = buf_.Append(src, left); size_t appended = buf_.Append(src, left);
left -= appended; left -= appended;
@ -96,7 +95,7 @@ Status WritableFileWriter::Append(const Slice& data) {
// We double the buffer here because // We double the buffer here because
// Flush calls do not keep up with the incoming bytes // Flush calls do not keep up with the incoming bytes
// This is the only place when buffer is changed with unbuffered I/O // This is the only place when buffer is changed with direct I/O
if (buf_.Capacity() < max_buffer_size_) { if (buf_.Capacity() < max_buffer_size_) {
size_t desiredCapacity = buf_.Capacity() * 2; size_t desiredCapacity = buf_.Capacity() * 2;
desiredCapacity = std::min(desiredCapacity, max_buffer_size_); desiredCapacity = std::min(desiredCapacity, max_buffer_size_);
@ -132,7 +131,7 @@ Status WritableFileWriter::Close() {
s = Flush(); // flush cache to OS s = Flush(); // flush cache to OS
// In unbuffered mode we write whole pages so // In direct I/O mode we write whole pages so
// we need to let the file know where data ends. // we need to let the file know where data ends.
Status interim = writable_file_->Truncate(filesize_); Status interim = writable_file_->Truncate(filesize_);
if (!interim.ok() && s.ok()) { if (!interim.ok() && s.ok()) {
@ -151,17 +150,18 @@ Status WritableFileWriter::Close() {
return s; return s;
} }
// write out the cached data to the OS cache // write out the cached data to the OS cache or storage if direct I/O
// enabled
Status WritableFileWriter::Flush() { Status WritableFileWriter::Flush() {
Status s; Status s;
TEST_KILL_RANDOM("WritableFileWriter::Flush:0", TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
rocksdb_kill_odds * REDUCE_ODDS2); rocksdb_kill_odds * REDUCE_ODDS2);
if (buf_.CurrentSize() > 0) { if (buf_.CurrentSize() > 0) {
if (use_os_buffer_) { if (direct_io_) {
s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); s = WriteDirect();
} else { } else {
s = WriteUnbuffered(); s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
} }
if (!s.ok()) { if (!s.ok()) {
return s; return s;
@ -259,8 +259,8 @@ size_t WritableFileWriter::RequestToken(size_t bytes, bool align) {
if (align) { if (align) {
// Here we may actually require more than burst and block // Here we may actually require more than burst and block
// but we can not write less than one page at a time on unbuffered // but we can not write less than one page at a time on direct I/O
// thus we may want not to use ratelimiter s // thus we may want not to use ratelimiter
size_t alignment = buf_.Alignment(); size_t alignment = buf_.Alignment();
bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes)); bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
} }
@ -273,7 +273,7 @@ size_t WritableFileWriter::RequestToken(size_t bytes, bool align) {
// limiter if available // limiter if available
Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
Status s; Status s;
assert(use_os_buffer_); assert(!direct_io_);
const char* src = data; const char* src = data;
size_t left = size; size_t left = size;
@ -308,10 +308,10 @@ Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
// whole number of pages to be written again on the next flush because we can // whole number of pages to be written again on the next flush because we can
// only write on aligned // only write on aligned
// offsets. // offsets.
Status WritableFileWriter::WriteUnbuffered() { Status WritableFileWriter::WriteDirect() {
Status s; Status s;
assert(!use_os_buffer_); assert(direct_io_);
const size_t alignment = buf_.Alignment(); const size_t alignment = buf_.Alignment();
assert((next_write_offset_ % alignment) == 0); assert((next_write_offset_ % alignment) == 0);
@ -339,7 +339,7 @@ Status WritableFileWriter::WriteUnbuffered() {
{ {
IOSTATS_TIMER_GUARD(write_nanos); IOSTATS_TIMER_GUARD(write_nanos);
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
// Unbuffered writes must be positional // direct writes must be positional
s = writable_file_->PositionedAppend(Slice(src, size), write_offset); s = writable_file_->PositionedAppend(Slice(src, size), write_offset);
if (!s.ok()) { if (!s.ok()) {
buf_.Size(file_advance + leftover_tail); buf_.Size(file_advance + leftover_tail);

@ -103,7 +103,6 @@ class WritableFileWriter {
uint64_t next_write_offset_; uint64_t next_write_offset_;
bool pending_sync_; bool pending_sync_;
const bool direct_io_; const bool direct_io_;
const bool use_os_buffer_;
uint64_t last_sync_size_; uint64_t last_sync_size_;
uint64_t bytes_per_sync_; uint64_t bytes_per_sync_;
RateLimiter* rate_limiter_; RateLimiter* rate_limiter_;
@ -118,7 +117,6 @@ class WritableFileWriter {
next_write_offset_(0), next_write_offset_(0),
pending_sync_(false), pending_sync_(false),
direct_io_(writable_file_->UseDirectIO()), direct_io_(writable_file_->UseDirectIO()),
use_os_buffer_(writable_file_->UseOSBuffer()),
last_sync_size_(0), last_sync_size_(0),
bytes_per_sync_(options.bytes_per_sync), bytes_per_sync_(options.bytes_per_sync),
rate_limiter_(options.rate_limiter) { rate_limiter_(options.rate_limiter) {
@ -156,8 +154,8 @@ class WritableFileWriter {
private: private:
// Used when os buffering is OFF and we are writing // Used when os buffering is OFF and we are writing
// DMA such as in Windows unbuffered mode // DMA such as in Direct I/O mode
Status WriteUnbuffered(); Status WriteDirect();
// Normal write // Normal write
Status WriteBuffered(const char* data, size_t size); Status WriteBuffered(const char* data, size_t size);
Status RangeSync(uint64_t offset, uint64_t nbytes); Status RangeSync(uint64_t offset, uint64_t nbytes);

@ -88,9 +88,9 @@ TEST_F(WritableFileWriterTest, RangeSync) {
TEST_F(WritableFileWriterTest, AppendStatusReturn) { TEST_F(WritableFileWriterTest, AppendStatusReturn) {
class FakeWF : public WritableFile { class FakeWF : public WritableFile {
public: public:
explicit FakeWF() : use_os_buffer_(true), io_error_(false) {} explicit FakeWF() : use_direct_io_(false), io_error_(false) {}
virtual bool UseOSBuffer() const override { return use_os_buffer_; } virtual bool UseDirectIO() const override { return use_direct_io_; }
Status Append(const Slice& data) override { Status Append(const Slice& data) override {
if (io_error_) { if (io_error_) {
return Status::IOError("Fake IO error"); return Status::IOError("Fake IO error");
@ -106,15 +106,15 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) {
Status Close() override { return Status::OK(); } Status Close() override { return Status::OK(); }
Status Flush() override { return Status::OK(); } Status Flush() override { return Status::OK(); }
Status Sync() override { return Status::OK(); } Status Sync() override { return Status::OK(); }
void SetUseOSBuffer(bool val) { use_os_buffer_ = val; } void SetUseDirectIO(bool val) { use_direct_io_ = val; }
void SetIOError(bool val) { io_error_ = val; } void SetIOError(bool val) { io_error_ = val; }
protected: protected:
bool use_os_buffer_; bool use_direct_io_;
bool io_error_; bool io_error_;
}; };
unique_ptr<FakeWF> wf(new FakeWF()); unique_ptr<FakeWF> wf(new FakeWF());
wf->SetUseOSBuffer(false); wf->SetUseDirectIO(true);
unique_ptr<WritableFileWriter> writer( unique_ptr<WritableFileWriter> writer(
new WritableFileWriter(std::move(wf), EnvOptions())); new WritableFileWriter(std::move(wf), EnvOptions()));

@ -165,7 +165,7 @@ PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* f,
: filename_(fname), : filename_(fname),
file_(f), file_(f),
fd_(fileno(f)), fd_(fileno(f)),
use_os_buffer_(options.use_os_buffer) {} use_direct_io_(options.use_direct_reads) {}
PosixSequentialFile::~PosixSequentialFile() { fclose(file_); } PosixSequentialFile::~PosixSequentialFile() { fclose(file_); }
@ -187,7 +187,7 @@ Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) {
s = IOError(filename_, errno); s = IOError(filename_, errno);
} }
} }
if (!use_os_buffer_) { if (use_direct_io_) {
// we need to fadvise away the entire range of pages because // we need to fadvise away the entire range of pages because
// we do not want readahead pages to be cached. // we do not want readahead pages to be cached.
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
@ -294,7 +294,7 @@ size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
*/ */
PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd, PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd,
const EnvOptions& options) const EnvOptions& options)
: filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) { : filename_(fname), fd_(fd), use_direct_io_(options.use_direct_reads) {
assert(!options.use_mmap_reads || sizeof(void*) < 8); assert(!options.use_mmap_reads || sizeof(void*) < 8);
} }
@ -325,7 +325,8 @@ Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
// An error: return a non-ok status // An error: return a non-ok status
s = IOError(filename_, errno); s = IOError(filename_, errno);
} }
if (!use_os_buffer_) {
if (use_direct_io_) {
// we need to fadvise away the entire range of pages because // we need to fadvise away the entire range of pages because
// we do not want readahead pages to be cached. // we do not want readahead pages to be cached.
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
@ -397,7 +398,7 @@ PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
fd_ = fd_ + 0; // suppress the warning for used variables fd_ = fd_ + 0; // suppress the warning for used variables
assert(options.use_mmap_reads); assert(options.use_mmap_reads);
assert(options.use_os_buffer); assert(!options.use_direct_reads);
} }
PosixMmapReadableFile::~PosixMmapReadableFile() { PosixMmapReadableFile::~PosixMmapReadableFile() {
@ -533,6 +534,7 @@ PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
#endif #endif
assert((page_size & (page_size - 1)) == 0); assert((page_size & (page_size - 1)) == 0);
assert(options.use_mmap_writes); assert(options.use_mmap_writes);
assert(!options.use_direct_writes);
} }
PosixMmapFile::~PosixMmapFile() { PosixMmapFile::~PosixMmapFile() {
@ -665,7 +667,10 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
*/ */
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
const EnvOptions& options) const EnvOptions& options)
: filename_(fname), fd_(fd), filesize_(0) { : filename_(fname),
direct_io_(options.use_direct_writes),
fd_(fd),
filesize_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT #ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate; allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size; fallocate_with_keep_size_ = options.fallocate_with_keep_size;
@ -680,6 +685,7 @@ PosixWritableFile::~PosixWritableFile() {
} }
Status PosixWritableFile::Append(const Slice& data) { Status PosixWritableFile::Append(const Slice& data) {
assert(!direct_io_|| (IsSectorAligned(data.size()) && IsPageAligned(data.data())));
const char* src = data.data(); const char* src = data.data();
size_t left = data.size(); size_t left = data.size();
while (left != 0) { while (left != 0) {
@ -698,6 +704,8 @@ Status PosixWritableFile::Append(const Slice& data) {
} }
Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
assert(direct_io_ && IsSectorAligned(offset) &&
IsSectorAligned(data.size()) && IsPageAligned(data.data()));
assert(offset <= std::numeric_limits<off_t>::max()); assert(offset <= std::numeric_limits<off_t>::max());
const char* src = data.data(); const char* src = data.data();
size_t left = data.size(); size_t left = data.size();
@ -713,7 +721,7 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
offset += done; offset += done;
src += done; src += done;
} }
filesize_ = offset + data.size(); filesize_ = offset;
return Status::OK(); return Status::OK();
} }
@ -778,6 +786,9 @@ bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
uint64_t PosixWritableFile::GetFileSize() { return filesize_; } uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
if (direct_io_) {
return Status::OK();
}
#ifndef OS_LINUX #ifndef OS_LINUX
return Status::OK(); return Status::OK();
#else #else
@ -825,29 +836,6 @@ size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
} }
#endif #endif
/*
* PosixDirectIOWritableFile
*/
Status PosixDirectIOWritableFile::Append(const Slice& data) {
assert(IsSectorAligned(data.size()) && IsPageAligned(data.data()));
if (!IsSectorAligned(data.size()) || !IsPageAligned(data.data())) {
return Status::IOError("Unaligned buffer for direct IO");
}
return PosixWritableFile::Append(data);
}
Status PosixDirectIOWritableFile::PositionedAppend(const Slice& data,
uint64_t offset) {
assert(IsSectorAligned(offset));
assert(IsSectorAligned(data.size()));
assert(IsPageAligned(data.data()));
if (!IsSectorAligned(offset) || !IsSectorAligned(data.size()) ||
!IsPageAligned(data.data())) {
return Status::IOError("offset or size is not aligned");
}
return PosixWritableFile::PositionedAppend(data, offset);
}
/* /*
* PosixRandomRWFile * PosixRandomRWFile
*/ */

@ -41,7 +41,7 @@ class PosixSequentialFile : public SequentialFile {
std::string filename_; std::string filename_;
FILE* file_; FILE* file_;
int fd_; int fd_;
bool use_os_buffer_; bool use_direct_io_;
public: public:
PosixSequentialFile(const std::string& fname, FILE* f, PosixSequentialFile(const std::string& fname, FILE* f,
@ -74,7 +74,7 @@ class PosixRandomAccessFile : public RandomAccessFile {
protected: protected:
std::string filename_; std::string filename_;
int fd_; int fd_;
bool use_os_buffer_; bool use_direct_io_;
public: public:
PosixRandomAccessFile(const std::string& fname, int fd, PosixRandomAccessFile(const std::string& fname, int fd,
@ -108,6 +108,7 @@ class PosixDirectIORandomAccessFile : public PosixRandomAccessFile {
class PosixWritableFile : public WritableFile { class PosixWritableFile : public WritableFile {
protected: protected:
const std::string filename_; const std::string filename_;
const bool direct_io_;
int fd_; int fd_;
uint64_t filesize_; uint64_t filesize_;
#ifdef ROCKSDB_FALLOCATE_PRESENT #ifdef ROCKSDB_FALLOCATE_PRESENT
@ -130,7 +131,13 @@ class PosixWritableFile : public WritableFile {
virtual Status Sync() override; virtual Status Sync() override;
virtual Status Fsync() override; virtual Status Fsync() override;
virtual bool IsSyncThreadSafe() const override; virtual bool IsSyncThreadSafe() const override;
virtual bool UseDirectIO() const override { return direct_io_; }
virtual uint64_t GetFileSize() override; virtual uint64_t GetFileSize() override;
virtual size_t GetRequiredBufferAlignment() const override {
// TODO(gzh): It should be the logical sector size/filesystem block size
// hardcoded as 4k for most cases
return 4 * 1024;
}
virtual Status InvalidateCache(size_t offset, size_t length) override; virtual Status InvalidateCache(size_t offset, size_t length) override;
#ifdef ROCKSDB_FALLOCATE_PRESENT #ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(uint64_t offset, uint64_t len) override; virtual Status Allocate(uint64_t offset, uint64_t len) override;
@ -139,22 +146,7 @@ class PosixWritableFile : public WritableFile {
#endif #endif
}; };
class PosixDirectIOWritableFile : public PosixWritableFile { // mmap() based random-access
public:
explicit PosixDirectIOWritableFile(const std::string& filename, int fd)
: PosixWritableFile(filename, fd, EnvOptions()) {}
virtual ~PosixDirectIOWritableFile() {}
bool UseOSBuffer() const override { return false; }
size_t GetRequiredBufferAlignment() const override { return 4 * 1024; }
Status Append(const Slice& data) override;
Status PositionedAppend(const Slice& data, uint64_t offset) override;
bool UseDirectIO() const override { return true; }
Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
};
class PosixMmapReadableFile : public RandomAccessFile { class PosixMmapReadableFile : public RandomAccessFile {
private: private:
int fd_; int fd_;

@ -34,8 +34,7 @@ namespace rocksdb {
void RunBenchmark() { void RunBenchmark() {
std::string file_name = test::TmpDir() + "/log_write_benchmark.log"; std::string file_name = test::TmpDir() + "/log_write_benchmark.log";
Env* env = Env::Default(); Env* env = Env::Default();
EnvOptions env_options; EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions());
env_options.use_mmap_writes = false;
env_options.bytes_per_sync = FLAGS_bytes_per_sync; env_options.bytes_per_sync = FLAGS_bytes_per_sync;
unique_ptr<WritableFile> file; unique_ptr<WritableFile> file;
env->NewWritableFile(file_name, &file, env_options); env->NewWritableFile(file_name, &file, env_options);

@ -221,9 +221,7 @@ class RandomAccessFileImpl : public RandomAccessFile {
class WritableFileImpl : public WritableFile { class WritableFileImpl : public WritableFile {
public: public:
WritableFileImpl(FileState* file) : file_(file) { explicit WritableFileImpl(FileState* file) : file_(file) { file_->Ref(); }
file_->Ref();
}
~WritableFileImpl() { ~WritableFileImpl() {
file_->Unref(); file_->Unref();

@ -440,8 +440,8 @@ Status MockEnv::NewRandomAccessFile(const std::string& fname,
} }
Status MockEnv::NewWritableFile(const std::string& fname, Status MockEnv::NewWritableFile(const std::string& fname,
unique_ptr<WritableFile>* result, unique_ptr<WritableFile>* result,
const EnvOptions& env_options) { const EnvOptions& env_options) {
auto fn = NormalizePath(fname); auto fn = NormalizePath(fname);
MutexLock lock(&mutex_); MutexLock lock(&mutex_);
if (file_map_.find(fn) != file_map_.end()) { if (file_map_.find(fn) != file_map_.end()) {

@ -198,10 +198,10 @@ DBOptions::DBOptions()
WAL_ttl_seconds(0), WAL_ttl_seconds(0),
WAL_size_limit_MB(0), WAL_size_limit_MB(0),
manifest_preallocation_size(4 * 1024 * 1024), manifest_preallocation_size(4 * 1024 * 1024),
allow_os_buffer(true),
allow_mmap_reads(false), allow_mmap_reads(false),
allow_mmap_writes(false), allow_mmap_writes(false),
use_direct_reads(false), use_direct_reads(false),
use_direct_writes(false),
allow_fallocate(true), allow_fallocate(true),
is_fd_close_on_exec(true), is_fd_close_on_exec(true),
skip_log_error_on_recovery(false), skip_log_error_on_recovery(false),
@ -269,10 +269,10 @@ DBOptions::DBOptions(const Options& options)
WAL_ttl_seconds(options.WAL_ttl_seconds), WAL_ttl_seconds(options.WAL_ttl_seconds),
WAL_size_limit_MB(options.WAL_size_limit_MB), WAL_size_limit_MB(options.WAL_size_limit_MB),
manifest_preallocation_size(options.manifest_preallocation_size), manifest_preallocation_size(options.manifest_preallocation_size),
allow_os_buffer(options.allow_os_buffer),
allow_mmap_reads(options.allow_mmap_reads), allow_mmap_reads(options.allow_mmap_reads),
allow_mmap_writes(options.allow_mmap_writes), allow_mmap_writes(options.allow_mmap_writes),
use_direct_reads(options.use_direct_reads), use_direct_reads(options.use_direct_reads),
use_direct_writes(options.use_direct_writes),
allow_fallocate(options.allow_fallocate), allow_fallocate(options.allow_fallocate),
is_fd_close_on_exec(options.is_fd_close_on_exec), is_fd_close_on_exec(options.is_fd_close_on_exec),
skip_log_error_on_recovery(options.skip_log_error_on_recovery), skip_log_error_on_recovery(options.skip_log_error_on_recovery),
@ -336,11 +336,11 @@ void DBOptions::Dump(Logger* log) const {
keep_log_file_num); keep_log_file_num);
Header(log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt, Header(log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
recycle_log_file_num); recycle_log_file_num);
Header(log, " Options.allow_os_buffer: %d", allow_os_buffer);
Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
Header(log, " Options.allow_fallocate: %d", allow_fallocate); Header(log, " Options.allow_fallocate: %d", allow_fallocate);
Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
Header(log, " Options.allow_mmap_writes: %d", allow_mmap_writes); Header(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
Header(log, " Options.use_direct_reads: %d", use_direct_reads); Header(log, " Options.use_direct_reads: %d", use_direct_reads);
Header(log, " Options.use_direct_writes: %d", use_direct_writes);
Header(log, " Options.create_missing_column_families: %d", Header(log, " Options.create_missing_column_families: %d",
create_missing_column_families); create_missing_column_families);
Header(log, " Options.db_log_dir: %s", Header(log, " Options.db_log_dir: %s",
@ -366,12 +366,6 @@ void DBOptions::Dump(Logger* log) const {
Header(log, Header(log,
" Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
manifest_preallocation_size); manifest_preallocation_size);
Header(log, " Options.allow_os_buffer: %d",
allow_os_buffer);
Header(log, " Options.allow_mmap_reads: %d",
allow_mmap_reads);
Header(log, " Options.allow_mmap_writes: %d",
allow_mmap_writes);
Header(log, " Options.is_fd_close_on_exec: %d", Header(log, " Options.is_fd_close_on_exec: %d",
is_fd_close_on_exec); is_fd_close_on_exec);
Header(log, " Options.stats_dump_period_sec: %u", Header(log, " Options.stats_dump_period_sec: %u",

@ -71,10 +71,10 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb; options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb;
options.manifest_preallocation_size = options.manifest_preallocation_size =
immutable_db_options.manifest_preallocation_size; immutable_db_options.manifest_preallocation_size;
options.allow_os_buffer = immutable_db_options.allow_os_buffer;
options.allow_mmap_reads = immutable_db_options.allow_mmap_reads; options.allow_mmap_reads = immutable_db_options.allow_mmap_reads;
options.allow_mmap_writes = immutable_db_options.allow_mmap_writes; options.allow_mmap_writes = immutable_db_options.allow_mmap_writes;
options.use_direct_reads = immutable_db_options.use_direct_reads; options.use_direct_reads = immutable_db_options.use_direct_reads;
options.use_direct_writes = immutable_db_options.use_direct_writes;
options.allow_fallocate = immutable_db_options.allow_fallocate; options.allow_fallocate = immutable_db_options.allow_fallocate;
options.is_fd_close_on_exec = immutable_db_options.is_fd_close_on_exec; options.is_fd_close_on_exec = immutable_db_options.is_fd_close_on_exec;
options.stats_dump_period_sec = immutable_db_options.stats_dump_period_sec; options.stats_dump_period_sec = immutable_db_options.stats_dump_period_sec;

@ -186,12 +186,14 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{"use_direct_reads", {"use_direct_reads",
{offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean, {offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean,
OptionVerificationType::kNormal, false, 0}}, OptionVerificationType::kNormal, false, 0}},
{"use_direct_writes",
{offsetof(struct DBOptions, use_direct_writes), OptionType::kBoolean,
OptionVerificationType::kNormal, false, 0}},
{"allow_2pc", {"allow_2pc",
{offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean, {offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean,
OptionVerificationType::kNormal, false, 0}}, OptionVerificationType::kNormal, false, 0}},
{"allow_os_buffer", {"allow_os_buffer",
{offsetof(struct DBOptions, allow_os_buffer), OptionType::kBoolean, {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}},
OptionVerificationType::kNormal, false, 0}},
{"create_if_missing", {"create_if_missing",
{offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean, {offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean,
OptionVerificationType::kNormal, false, 0}}, OptionVerificationType::kNormal, false, 0}},

@ -267,7 +267,6 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"max_background_flushes=35;" "max_background_flushes=35;"
"create_if_missing=false;" "create_if_missing=false;"
"error_if_exists=true;" "error_if_exists=true;"
"allow_os_buffer=false;"
"delayed_write_rate=4294976214;" "delayed_write_rate=4294976214;"
"manifest_preallocation_size=1222;" "manifest_preallocation_size=1222;"
"allow_mmap_writes=false;" "allow_mmap_writes=false;"
@ -275,6 +274,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"allow_fallocate=true;" "allow_fallocate=true;"
"allow_mmap_reads=false;" "allow_mmap_reads=false;"
"use_direct_reads=false;" "use_direct_reads=false;"
"use_direct_writes=false;"
"max_log_file_size=4607;" "max_log_file_size=4607;"
"random_access_max_buffer_size=1048576;" "random_access_max_buffer_size=1048576;"
"advise_random_on_open=true;" "advise_random_on_open=true;"

@ -116,9 +116,10 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"WAL_ttl_seconds", "43"}, {"WAL_ttl_seconds", "43"},
{"WAL_size_limit_MB", "44"}, {"WAL_size_limit_MB", "44"},
{"manifest_preallocation_size", "45"}, {"manifest_preallocation_size", "45"},
{"allow_os_buffer", "false"},
{"allow_mmap_reads", "true"}, {"allow_mmap_reads", "true"},
{"allow_mmap_writes", "false"}, {"allow_mmap_writes", "false"},
{"use_direct_reads", "false"},
{"use_direct_writes", "false"},
{"is_fd_close_on_exec", "true"}, {"is_fd_close_on_exec", "true"},
{"skip_log_error_on_recovery", "false"}, {"skip_log_error_on_recovery", "false"},
{"stats_dump_period_sec", "46"}, {"stats_dump_period_sec", "46"},
@ -231,9 +232,10 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43)); ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44)); ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U); ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U);
ASSERT_EQ(new_db_opt.allow_os_buffer, false);
ASSERT_EQ(new_db_opt.allow_mmap_reads, true); ASSERT_EQ(new_db_opt.allow_mmap_reads, true);
ASSERT_EQ(new_db_opt.allow_mmap_writes, false); ASSERT_EQ(new_db_opt.allow_mmap_writes, false);
ASSERT_EQ(new_db_opt.use_direct_reads, false);
ASSERT_EQ(new_db_opt.use_direct_writes, false);
ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true); ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true);
ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false);
ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U);

@ -240,7 +240,8 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
db_opt->advise_random_on_open = rnd->Uniform(2); db_opt->advise_random_on_open = rnd->Uniform(2);
db_opt->allow_mmap_reads = rnd->Uniform(2); db_opt->allow_mmap_reads = rnd->Uniform(2);
db_opt->allow_mmap_writes = rnd->Uniform(2); db_opt->allow_mmap_writes = rnd->Uniform(2);
db_opt->allow_os_buffer = rnd->Uniform(2); db_opt->use_direct_reads = rnd->Uniform(2);
db_opt->use_direct_writes = rnd->Uniform(2);
db_opt->create_if_missing = rnd->Uniform(2); db_opt->create_if_missing = rnd->Uniform(2);
db_opt->create_missing_column_families = rnd->Uniform(2); db_opt->create_missing_column_families = rnd->Uniform(2);
db_opt->disableDataSync = rnd->Uniform(2); db_opt->disableDataSync = rnd->Uniform(2);

@ -1162,7 +1162,7 @@ Status BackupEngineImpl::CopyOrCreateFile(
unique_ptr<SequentialFile> src_file; unique_ptr<SequentialFile> src_file;
EnvOptions env_options; EnvOptions env_options;
env_options.use_mmap_writes = false; env_options.use_mmap_writes = false;
env_options.use_os_buffer = false; // TODO:(gzh) maybe use direct writes here if possible
if (size != nullptr) { if (size != nullptr) {
*size = 0; *size = 0;
} }
@ -1365,7 +1365,7 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env,
EnvOptions env_options; EnvOptions env_options;
env_options.use_mmap_writes = false; env_options.use_mmap_writes = false;
env_options.use_os_buffer = false; env_options.use_direct_reads = false;
std::unique_ptr<SequentialFile> src_file; std::unique_ptr<SequentialFile> src_file;
Status s = src_env->NewSequentialFile(src, &src_file, env_options); Status s = src_env->NewSequentialFile(src, &src_file, env_options);
@ -1671,6 +1671,7 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) {
unique_ptr<WritableFile> backup_meta_file; unique_ptr<WritableFile> backup_meta_file;
EnvOptions env_options; EnvOptions env_options;
env_options.use_mmap_writes = false; env_options.use_mmap_writes = false;
env_options.use_direct_writes = false;
s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file, s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file,
env_options); env_options);
if (!s.ok()) { if (!s.ok()) {

@ -18,7 +18,7 @@ class SequentialFileMirror : public SequentialFile {
public: public:
unique_ptr<SequentialFile> a_, b_; unique_ptr<SequentialFile> a_, b_;
std::string fname; std::string fname;
SequentialFileMirror(std::string f) : fname(f) {} explicit SequentialFileMirror(std::string f) : fname(f) {}
Status Read(size_t n, Slice* result, char* scratch) { Status Read(size_t n, Slice* result, char* scratch) {
Slice aslice; Slice aslice;
@ -62,7 +62,7 @@ class RandomAccessFileMirror : public RandomAccessFile {
public: public:
unique_ptr<RandomAccessFile> a_, b_; unique_ptr<RandomAccessFile> a_, b_;
std::string fname; std::string fname;
RandomAccessFileMirror(std::string f) : fname(f) {} explicit RandomAccessFileMirror(std::string f) : fname(f) {}
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
Status as = a_->Read(offset, n, result, scratch); Status as = a_->Read(offset, n, result, scratch);
@ -101,7 +101,7 @@ class WritableFileMirror : public WritableFile {
public: public:
unique_ptr<WritableFile> a_, b_; unique_ptr<WritableFile> a_, b_;
std::string fname; std::string fname;
WritableFileMirror(std::string f) : fname(f) {} explicit WritableFileMirror(std::string f) : fname(f) {}
Status Append(const Slice& data) override { Status Append(const Slice& data) override {
Status as = a_->Append(data); Status as = a_->Append(data);

Loading…
Cancel
Save