Implement PositionedAppend for PosixWritableFile

Summary:
This patch clarifies the contract of PositionedAppend with some unit
tests and also implements it for PosixWritableFile. (Tasks: 14524071)
Closes https://github.com/facebook/rocksdb/pull/1514

Differential Revision: D4204907

Pulled By: maysamyabandeh

fbshipit-source-id: 06eabd2
main
Maysam Yabandeh 8 years ago committed by Facebook Github Bot
parent 3f62215210
commit 9d60151b04
  1. 25
      include/rocksdb/env.h
  2. 9
      util/env_posix.cc
  3. 33
      util/env_test.cc
  4. 20
      util/io_posix.cc
  5. 1
      util/io_posix.h

@ -518,10 +518,31 @@ class WritableFile {
return c_DefaultPageSize; return c_DefaultPageSize;
} }
// Append data to the end of the file
// Note: A WritableFile object must support either Append or
// PositionedAppend, so the users cannot mix the two.
virtual Status Append(const Slice& data) = 0; virtual Status Append(const Slice& data) = 0;
// Positioned write for unbuffered access default forward // PositionedAppend data to the specified offset. The new EOF after append
// to simple append as most of the tests are buffered by default // must be larger than the previous EOF. This is to be used when writes are
// not backed by OS buffers and hence has to always start from the start of
// the sector. The implementation thus needs to also rewrite the last
// partial sector.
// Note: PositionedAppend does not guarantee moving the file offset after the
// write. A WritableFile object must support either Append or
// PositionedAppend, so the users cannot mix the two.
//
// PositionedAppend() can only happen on the page/sector boundaries. For that
// reason, if the last write was an incomplete sector we still need to rewind
// back to the nearest sector/page and rewrite the portion of it with whatever
// we need to add. We need to keep where we stop writing.
//
// PositionedAppend() can only write whole sectors. For that reason we have to
// pad with zeros for the last write and trim the file when closing according
// to the position we keep in the previous step.
//
// PositionedAppend() requires aligned buffer to be passed in. The alignment
// required is queried via GetRequiredBufferAlignment()
virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) { virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) {
return Status::NotSupported(); return Status::NotSupported();
} }

@ -272,7 +272,14 @@ class PosixEnv : public Env {
#ifdef OS_MACOSX #ifdef OS_MACOSX
int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT; int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT;
#else #else
int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT | O_DIRECT; // Note: we should avoid O_APPEND here due to the following bug:
// POSIX requires that opening a file with the O_APPEND flag should
// have no effect on the location at which pwrite() writes data.
// However, on Linux, if a file is opened with O_APPEND, pwrite()
// appends data to the end of the file, regardless of the value of
// offset.
// More info here: https://linux.die.net/man/2/pwrite
int flags = O_WRONLY | O_TRUNC | O_CREAT | O_DIRECT;
#endif #endif
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags); TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
fd = open(fname.c_str(), flags, 0644); fd = open(fname.c_str(), flags, 0644);

@ -640,6 +640,39 @@ class IoctlFriendlyTmpdir {
std::string dir_; std::string dir_;
}; };
// Exercises PositionedAppend on a file opened for direct (non-mmap) writes:
// a later positioned write that starts inside previously written data must
// overwrite from that offset onward, and the file must end where the last
// write ended.
TEST_F(EnvPosixTest, PositionedAppend) {
  unique_ptr<WritableFile> writable_file;
  EnvOptions options;
  options.use_direct_writes = true;
  options.use_mmap_writes = false;
  IoctlFriendlyTmpdir ift;
  const std::string file_path = ift.name() + "/f";
  ASSERT_OK(env_->NewWritableFile(file_path, &writable_file, options));

  const size_t kBlockSize = 512;
  const size_t kPageSize = 4096;
  const size_t kDataSize = kPageSize;

  // First write: one full page of 'a' at the very beginning of the file.
  auto page_of_a = NewAligned(kDataSize, 'a');
  ASSERT_OK(writable_file->PositionedAppend(
      Slice(page_of_a.get(), kDataSize), 0U));

  // Second write: one full page of 'b' starting right after the first sector,
  // which overlaps the tail of the first write.
  auto page_of_b = NewAligned(kDataSize, 'b');
  ASSERT_OK(writable_file->PositionedAppend(
      Slice(page_of_b.get(), kDataSize), kBlockSize));
  ASSERT_OK(writable_file->Close());

  // Expected layout: one sector of 'a' followed by one page of 'b'.
  // Read the file back and check the total size and the a/b boundary.
  unique_ptr<SequentialFile> reader;
  ASSERT_OK(env_->NewSequentialFile(file_path, &reader, options));
  char read_buf[kPageSize * 2];
  Slice contents;
  ASSERT_OK(reader->Read(sizeof(read_buf), &contents, read_buf));
  const size_t expected_size = kPageSize + kBlockSize;
  ASSERT_EQ(expected_size, contents.size());
  ASSERT_EQ('a', contents[kBlockSize - 1]);
  ASSERT_EQ('b', contents[kBlockSize]);
}
// Only works in linux platforms // Only works in linux platforms
TEST_F(EnvPosixTest, RandomAccessUniqueID) { TEST_F(EnvPosixTest, RandomAccessUniqueID) {

@ -692,6 +692,26 @@ Status PosixWritableFile::Append(const Slice& data) {
return Status::OK(); return Status::OK();
} }
// Writes all of `data` to the file starting at byte `offset` using pwrite().
//
// Short writes are resumed from where they stopped and calls interrupted by
// a signal (EINTR) are retried, so on success every byte of `data` has been
// handed to the OS. On success filesize_ is set to the offset just past the
// last byte written. Any other pwrite() failure is returned as an IOError
// built from filename_ and errno; filesize_ is left untouched in that case.
Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
  // pwrite() takes a signed off_t; the caller's offset must fit in it.
  assert(offset <=
         static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  const char* src = data.data();
  size_t left = data.size();
  while (left != 0) {
    ssize_t done = pwrite(fd_, src, left, static_cast<off_t>(offset));
    if (done < 0) {
      if (errno == EINTR) {
        // Interrupted before anything was written: retry the same range.
        continue;
      }
      return IOError(filename_, errno);
    }
    // Advance past whatever was actually written (may be a partial write).
    left -= done;
    offset += done;
    src += done;
  }
  // `offset` has been advanced by every successful pwrite() above, so it is
  // already the end position of this write. Adding data.size() again would
  // count the payload twice.
  filesize_ = offset;
  return Status::OK();
}
Status PosixWritableFile::Close() { Status PosixWritableFile::Close() {
Status s; Status s;

@ -125,6 +125,7 @@ class PosixWritableFile : public WritableFile {
virtual Status Truncate(uint64_t size) override { return Status::OK(); } virtual Status Truncate(uint64_t size) override { return Status::OK(); }
virtual Status Close() override; virtual Status Close() override;
virtual Status Append(const Slice& data) override; virtual Status Append(const Slice& data) override;
virtual Status PositionedAppend(const Slice& data, uint64_t offset) override;
virtual Status Flush() override; virtual Status Flush() override;
virtual Status Sync() override; virtual Status Sync() override;
virtual Status Fsync() override; virtual Status Fsync() override;

Loading…
Cancel
Save