From 9d60151b04077b76dfc32c5c210ab9929066a6c7 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Fri, 18 Nov 2016 17:06:37 -0800 Subject: [PATCH] Implement PositionedAppend for PosixWritableFile Summary: This patch clarifies the contract of PositionedAppend with some unit tests and also implements it for PosixWritableFile. (Tasks: 14524071) Closes https://github.com/facebook/rocksdb/pull/1514 Differential Revision: D4204907 Pulled By: maysamyabandeh fbshipit-source-id: 06eabd2 --- include/rocksdb/env.h | 25 +++++++++++++++++++++++-- util/env_posix.cc | 9 ++++++++- util/env_test.cc | 33 +++++++++++++++++++++++++++++++++ util/io_posix.cc | 20 ++++++++++++++++++++ util/io_posix.h | 1 + 5 files changed, 85 insertions(+), 3 deletions(-) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index e965e7ce6..5a1e425a9 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -518,10 +518,31 @@ class WritableFile { return c_DefaultPageSize; } + // Append data to the end of the file + // Note: A WriteabelFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. virtual Status Append(const Slice& data) = 0; - // Positioned write for unbuffered access default forward - // to simple append as most of the tests are buffered by default + // PositionedAppend data to the specified offset. The new EOF after append + // must be larger than the previous EOF. This is to be used when writes are + // not backed by OS buffers and hence has to always start from the start of + // the sector. The implementation thus needs to also rewrite the last + // partial sector. + // Note: PositionAppend does not guarantee moving the file offset after the + // write. A WriteabelFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + // + // PositionedAppend() can only happen on the page/sector boundaries. For that + // reason, if the last write was an incomplete sector we still need to rewind + // back to the nearest sector/page and rewrite the portion of it with whatever + // we need to add. We need to keep where we stop writing. + // + // PositionedAppend() can only write whole sectors. For that reason we have to + // pad with zeros for the last write and trim the file when closing according + // to the position we keep in the previous step. + // + // PositionedAppend() requires aligned buffer to be passed in. The alignment + // required is queried via GetRequiredBufferAlignment() virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) { return Status::NotSupported(); } diff --git a/util/env_posix.cc b/util/env_posix.cc index 056ccc855..01f4dc515 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -272,7 +272,14 @@ class PosixEnv : public Env { #ifdef OS_MACOSX int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT; #else - int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT | O_DIRECT; + // Note: we should avoid O_APPEND here due to ta the following bug: + // POSIX requires that opening a file with the O_APPEND flag should + // have no affect on the location at which pwrite() writes data. + // However, on Linux, if a file is opened with O_APPEND, pwrite() + // appends data to the end of the file, regardless of the value of + // offset. + // More info here: https://linux.die.net/man/2/pwrite + int flags = O_WRONLY | O_TRUNC | O_CREAT | O_DIRECT; #endif TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags); fd = open(fname.c_str(), flags, 0644); diff --git a/util/env_test.cc b/util/env_test.cc index 2819d496b..8c40fb80a 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -640,6 +640,39 @@ class IoctlFriendlyTmpdir { std::string dir_; }; +TEST_F(EnvPosixTest, PositionedAppend) { + unique_ptr writable_file; + + EnvOptions options; + options.use_direct_writes = true; + options.use_mmap_writes = false; + IoctlFriendlyTmpdir ift; + ASSERT_OK(env_->NewWritableFile(ift.name() + "/f", &writable_file, options)); + + const size_t kBlockSize = 512; + const size_t kPageSize = 4096; + const size_t kDataSize = kPageSize; + // Write a page worth of 'a' + auto data_ptr = NewAligned(kDataSize, 'a'); + Slice data_a(data_ptr.get(), kDataSize); + ASSERT_OK(writable_file->PositionedAppend(data_a, 0U)); + // Write a page worth of 'b' right after the first sector + data_ptr = NewAligned(kDataSize, 'b'); + Slice data_b(data_ptr.get(), kDataSize); + ASSERT_OK(writable_file->PositionedAppend(data_b, kBlockSize)); + ASSERT_OK(writable_file->Close()); + // The file now has 1 sector worth of a followed by a page worth of b + + // Verify the above + unique_ptr seq_file; + ASSERT_OK(env_->NewSequentialFile(ift.name() + "/f", &seq_file, options)); + char scratch[kPageSize * 2]; + Slice result; + ASSERT_OK(seq_file->Read(sizeof(scratch), &result, scratch)); + ASSERT_EQ(kPageSize + kBlockSize, result.size()); + ASSERT_EQ('a', result[kBlockSize - 1]); + ASSERT_EQ('b', result[kBlockSize]); +} // Only works in linux platforms TEST_F(EnvPosixTest, RandomAccessUniqueID) { diff --git a/util/io_posix.cc b/util/io_posix.cc index 7892397a0..426d2c560 100644 --- a/util/io_posix.cc +++ b/util/io_posix.cc @@ -692,6 +692,26 @@ Status PosixWritableFile::Append(const Slice& data) { return Status::OK(); } +Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { + assert(offset <= std::numeric_limits::max()); + const char* src = data.data(); + size_t left = data.size(); + while (left != 0) { + ssize_t done = pwrite(fd_, src, left, static_cast(offset)); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return IOError(filename_, errno); + } + left -= done; + offset += done; + src += done; + } + filesize_ = offset + data.size(); + return Status::OK(); +} + Status PosixWritableFile::Close() { Status s; diff --git a/util/io_posix.h b/util/io_posix.h index bc7f0c01c..ac0149c9c 100644 --- a/util/io_posix.h +++ b/util/io_posix.h @@ -125,6 +125,7 @@ class PosixWritableFile : public WritableFile { virtual Status Truncate(uint64_t size) override { return Status::OK(); } virtual Status Close() override; virtual Status Append(const Slice& data) override; + virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; virtual Status Flush() override; virtual Status Sync() override; virtual Status Fsync() override;