Use fallocate to prevent excessive allocation of sst files and logs

Summary:
On some filesystems, pre-allocation can be a considerable
amount of space.  xfs in our production environment pre-allocates by
1GB, for instance.  By using fallocate to inform the kernel of our
expected file sizes, we eliminate this wasteage (that isn't recovered
until the file is closed which, in the case of LOG files, can be a
considerable amount of time).

Test Plan:
created an xfs loopback filesystem, mounted with
allocsize=4M, and ran db_stress.  LOG file without this change was 4M,
and with it it was 128k then grew to normal size.

Reviewers: dhruba

Reviewed By: dhruba

CC: adsharma, leveldb

Differential Revision: https://reviews.facebook.net/D7953
main
Chip Turner 12 years ago
parent 2fdf91a4f8
commit 3dafdfb2c4
  1. 11
      db/db_impl.cc
  2. 29
      db/db_test.cc
  3. 5
      db/version_set.h
  4. 53
      include/leveldb/env.h
  5. 6
      include/leveldb/options.h
  6. 18
      util/env_posix.cc
  7. 8
      util/options.cc
  8. 28
      util/posix_logger.h

@ -295,6 +295,7 @@ Status DBImpl::NewDB() {
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
file->SetPreallocationBlockSize(options_.manifest_preallocation_size);
{ {
log::Writer log(std::move(file)); log::Writer log(std::move(file));
std::string record; std::string record;
@ -1380,6 +1381,12 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
// Make the output file // Make the output file
std::string fname = TableFileName(dbname_, file_number); std::string fname = TableFileName(dbname_, file_number);
Status s = env_->NewWritableFile(fname, &compact->outfile); Status s = env_->NewWritableFile(fname, &compact->outfile);
// Over-estimate slightly so we don't end up just barely crossing
// the threshold.
compact->outfile->SetPreallocationBlockSize(
1.1 * versions_->MaxFileSizeForLevel(compact->compaction->level() + 1));
if (s.ok()) { if (s.ok()) {
compact->builder.reset(new TableBuilder(options_, compact->outfile.get(), compact->builder.reset(new TableBuilder(options_, compact->outfile.get(),
compact->compaction->level() + 1)); compact->compaction->level() + 1));
@ -2105,6 +2112,9 @@ Status DBImpl::MakeRoomForWrite(bool force) {
versions_->ReuseFileNumber(new_log_number); versions_->ReuseFileNumber(new_log_number);
break; break;
} }
// Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution.
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size);
logfile_number_ = new_log_number; logfile_number_ = new_log_number;
log_.reset(new log::Writer(std::move(lfile))); log_.reset(new log::Writer(std::move(lfile)));
imm_.Add(mem_); imm_.Add(mem_);
@ -2292,6 +2302,7 @@ Status DB::Open(const Options& options, const std::string& dbname,
s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
&lfile); &lfile);
if (s.ok()) { if (s.ok()) {
lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size);
edit.SetLogNumber(new_log_number); edit.SetLogNumber(new_log_number);
impl->logfile_number_ = new_log_number; impl->logfile_number_ = new_log_number;
impl->log_.reset(new log::Writer(std::move(lfile))); impl->log_.reset(new log::Writer(std::move(lfile)));

@ -524,6 +524,35 @@ TEST(DBTest, ReadWrite) {
} while (ChangeOptions()); } while (ChangeOptions());
} }
TEST(DBTest, Preallocation) {
const std::string src = dbname_ + "/alloc_test";
unique_ptr<WritableFile> srcfile;
ASSERT_OK(env_->NewWritableFile(src, &srcfile));
srcfile->SetPreallocationBlockSize(1024 * 1024);
// No writes should mean no preallocation
size_t block_size, last_allocated_block;
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 0UL);
// Small write should preallocate one block
srcfile->Append("test");
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 1UL);
// Write an entire preallocation block, make sure we increased by two.
std::string buf(block_size, ' ');
srcfile->Append(buf);
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 2UL);
// Write five more blocks at once, ensure we're where we need to be.
buf = std::string(block_size * 5, ' ');
srcfile->Append(buf);
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 7UL);
}
TEST(DBTest, PutDeleteGet) { TEST(DBTest, PutDeleteGet) {
do { do {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));

@ -369,6 +369,9 @@ class VersionSet {
// record results in files_by_size_. The largest files are listed first. // record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize(Version *v); void UpdateFilesBySize(Version *v);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level);
private: private:
class Builder; class Builder;
struct ManifestWriter; struct ManifestWriter;
@ -400,8 +403,6 @@ class VersionSet {
double MaxBytesForLevel(int level); double MaxBytesForLevel(int level);
uint64_t MaxFileSizeForLevel(int level);
int64_t ExpandedCompactionByteSizeLimit(int level); int64_t ExpandedCompactionByteSizeLimit(int level);
int64_t MaxGrandParentOverlapBytes(int level); int64_t MaxGrandParentOverlapBytes(int level);

@ -230,7 +230,8 @@ class RandomAccessFile {
// at a time to the file. // at a time to the file.
class WritableFile { class WritableFile {
public: public:
WritableFile() { } WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
}
virtual ~WritableFile(); virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0; virtual Status Append(const Slice& data) = 0;
@ -255,7 +256,57 @@ class WritableFile {
return 0; return 0;
} }
/*
* Get and set the default pre-allocation block size for writes to
* this file. If non-zero, then Allocate will be used to extend the
* underlying storage of a file (generally via fallocate) if the Env
* instance supports it.
*/
void SetPreallocationBlockSize(size_t size) {
preallocation_block_size_ = size;
}
virtual void GetPreallocationStatus(size_t* block_size,
size_t* last_allocated_block) {
*last_allocated_block = last_preallocated_block_;
*block_size = preallocation_block_size_;
}
protected:
// PrepareWrite performs any necessary preparation for a write
// before the write actually occurs. This allows for pre-allocation
// of space on devices where it can result in less file
// fragmentation and/or less waste from over-zealous filesystem
// pre-allocation.
void PrepareWrite(size_t offset, size_t len) {
if (preallocation_block_size_ == 0) {
return;
}
// If this write would cross one or more preallocation blocks,
// determine what the last preallocation block necesessary to
// cover this write would be and Allocate to that point.
const auto block_size = preallocation_block_size_;
size_t new_last_preallocated_block =
(offset + len + block_size - 1) / block_size;
if (new_last_preallocated_block > last_preallocated_block_) {
size_t num_spanned_blocks =
new_last_preallocated_block - last_preallocated_block_;
Allocate(block_size * last_preallocated_block_,
block_size * num_spanned_blocks);
last_preallocated_block_ = new_last_preallocated_block;
}
}
/*
* Pre-allocate space for a file.
*/
virtual Status Allocate(off_t offset, off_t len) {
return Status::OK();
}
private: private:
size_t last_preallocated_block_;
size_t preallocation_block_size_;
// No copying allowed // No copying allowed
WritableFile(const WritableFile&); WritableFile(const WritableFile&);
void operator=(const WritableFile&); void operator=(const WritableFile&);

@ -363,6 +363,12 @@ struct Options {
// deleted. // deleted.
// Default : 0 // Default : 0
uint64_t WAL_ttl_seconds; uint64_t WAL_ttl_seconds;
// Number of bytes to preallocate (via fallocate) the manifest
// files. Default is 4mb, which is reasonable to reduce random IO
// as well as prevent overallocation for mounts that preallocate
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size;
}; };
// Options that control read operations // Options that control read operations

@ -232,6 +232,7 @@ class PosixMmapFile : public WritableFile {
virtual Status Append(const Slice& data) { virtual Status Append(const Slice& data) {
const char* src = data.data(); const char* src = data.data();
size_t left = data.size(); size_t left = data.size();
PrepareWrite(GetFileSize(), left);
while (left > 0) { while (left > 0) {
assert(base_ <= dst_); assert(base_ <= dst_);
assert(dst_ <= limit_); assert(dst_ <= limit_);
@ -330,6 +331,14 @@ class PosixMmapFile : public WritableFile {
size_t used = dst_ - base_; size_t used = dst_ - base_;
return file_offset_ + used; return file_offset_ + used;
} }
virtual Status Allocate(off_t offset, off_t len) {
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
}; };
// Use posix write to write data to a file. // Use posix write to write data to a file.
@ -371,6 +380,7 @@ class PosixWritableFile : public WritableFile {
pending_sync_ = true; pending_sync_ = true;
pending_fsync_ = true; pending_fsync_ = true;
PrepareWrite(GetFileSize(), left);
// if there is no space in the cache, then flush // if there is no space in the cache, then flush
if (cursize_ + left > capacity_) { if (cursize_ + left > capacity_) {
s = Flush(); s = Flush();
@ -455,6 +465,14 @@ class PosixWritableFile : public WritableFile {
virtual uint64_t GetFileSize() { virtual uint64_t GetFileSize() {
return filesize_; return filesize_;
} }
virtual Status Allocate(off_t offset, off_t len) {
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
}; };
static int LockOrUnlock(const std::string& fname, int fd, bool lock) { static int LockOrUnlock(const std::string& fname, int fd, bool lock) {

@ -56,7 +56,9 @@ Options::Options()
compaction_filter_args(NULL), compaction_filter_args(NULL),
CompactionFilter(NULL), CompactionFilter(NULL),
disable_auto_compactions(false), disable_auto_compactions(false),
WAL_ttl_seconds(0){ WAL_ttl_seconds(0),
manifest_preallocation_size(4 * 1024 * 1024) {
} }
void void
@ -144,8 +146,10 @@ Options::Dump(Logger* log) const
CompactionFilter); CompactionFilter);
Log(log," Options.disable_auto_compactions: %d", Log(log," Options.disable_auto_compactions: %d",
disable_auto_compactions); disable_auto_compactions);
Log(log," Options.WAL_ttl_seconds: %ld", Log(log," Options.WAL_ttl_seconds: %ld",
WAL_ttl_seconds); WAL_ttl_seconds);
Log(log," Options.manifest_preallocation_size: %ld",
manifest_preallocation_size);
} // Options::Dump } // Options::Dump
} // namespace leveldb } // namespace leveldb

@ -12,19 +12,25 @@
#include <stdio.h> #include <stdio.h>
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>
#include "leveldb/env.h" #include "leveldb/env.h"
namespace leveldb { namespace leveldb {
const int kDebugLogChunkSize = 128 * 1024;
class PosixLogger : public Logger { class PosixLogger : public Logger {
private: private:
FILE* file_; FILE* file_;
uint64_t (*gettid_)(); // Return the thread id for the current thread uint64_t (*gettid_)(); // Return the thread id for the current thread
size_t log_size_; size_t log_size_;
int fd_;
public: public:
PosixLogger(FILE* f, uint64_t (*gettid)()) : PosixLogger(FILE* f, uint64_t (*gettid)()) :
file_(f), gettid_(gettid), log_size_(0) { } file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)) { }
virtual ~PosixLogger() { virtual ~PosixLogger() {
fclose(file_); fclose(file_);
} }
@ -86,9 +92,25 @@ class PosixLogger : public Logger {
} }
assert(p <= limit); assert(p <= limit);
fwrite(base, 1, p - base, file_);
// If this write would cross a boundary of kDebugLogChunkSize
// space, pre-allocate more space to avoid overly large
// allocations from filesystem allocsize options.
const size_t write_size = p - base;
const int last_allocation_chunk =
((kDebugLogChunkSize - 1 + log_size_) / kDebugLogChunkSize);
const int desired_allocation_chunk =
((kDebugLogChunkSize - 1 + log_size_ + write_size) /
kDebugLogChunkSize);
if (last_allocation_chunk != desired_allocation_chunk) {
fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0,
desired_allocation_chunk * kDebugLogChunkSize);
}
fwrite(base, 1, write_size, file_);
fflush(file_); fflush(file_);
log_size_ += (p - base);
log_size_ += write_size;
if (base != buffer) { if (base != buffer) {
delete[] base; delete[] base;

Loading…
Cancel
Save