[RocksDB] [Performance] Allow different posix advice to be applied to the same table file

Summary:
The current posix advice implementation ties the access pattern hint to the creation of a file.
It is not possible to apply different advice for different kinds of access (random get vs compaction read),
without keeping two open files for the same table. This patch extends the RandomAccessFile interface
to accept a new access hint at any time. In particular, we are able to set a different access hint on the same
table file based on when/how the file is used.
Two options are added to set the access hint: one applied when the file is first opened, and one applied
when a compaction that reads the file is started.

Test Plan: make check; db_stress; db_bench

Reviewers: dhruba

Reviewed By: dhruba

CC: MarkCallaghan, leveldb

Differential Revision: https://reviews.facebook.net/D10905
main
Haobo Xu 11 years ago
parent 2df65c118c
commit ab8d2f6ab2
  1. 33
      db/db_bench.cc
  2. 11
      db/table_cache.cc
  3. 3
      db/table_cache.h
  4. 13
      db/version_set.cc
  5. 6
      include/leveldb/env.h
  6. 10
      include/leveldb/options.h
  7. 24
      table/table.cc
  8. 5
      table/table.h
  9. 21
      table/two_level_iterator.cc
  10. 6
      table/two_level_iterator.h
  11. 24
      util/env_posix.cc
  12. 12
      util/options.cc

@ -293,6 +293,14 @@ static bool FLAGS_use_mmap_writes;
// Allow readaheads to occur for compactions
static bool FLAGS_use_readahead_compactions;
// Advise random access on table file open
static bool FLAGS_advise_random_on_open =
leveldb::Options().advise_random_on_open;
// Access pattern advice when a file is compacted
static auto FLAGS_compaction_fadvice =
leveldb::Options().access_hint_on_compaction_start;
namespace leveldb {
// Helper for quickly generating random data.
@ -900,6 +908,7 @@ unique_ptr<char []> GenerateKeyFromInt(int v, const char* suffix = "")
}
if (method != nullptr) {
fprintf(stdout, "DB path: [%s]\n", FLAGS_db);
RunBenchmark(num_threads, name, method);
}
}
@ -1138,6 +1147,8 @@ unique_ptr<char []> GenerateKeyFromInt(int v, const char* suffix = "")
options.allow_mmap_reads = FLAGS_use_mmap_reads;
options.allow_mmap_writes = FLAGS_use_mmap_writes;
options.allow_readahead_compactions = FLAGS_use_readahead_compactions;
options.advise_random_on_open = FLAGS_advise_random_on_open;
options.access_hint_on_compaction_start = FLAGS_compaction_fadvice;
Status s;
if(FLAGS_read_only) {
s = DB::OpenForReadOnly(options, FLAGS_db, &db_);
@ -1731,8 +1742,9 @@ int main(int argc, char** argv) {
int n;
long l;
char junk;
char hdfsname[2048];
char buf[2048];
char str[512];
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
@ -1848,8 +1860,8 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--get_approx=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_get_approx = n;
} else if (sscanf(argv[i], "--hdfs=%s", hdfsname) == 1) {
FLAGS_env = new leveldb::HdfsEnv(hdfsname);
} else if (sscanf(argv[i], "--hdfs=%s", buf) == 1) {
FLAGS_env = new leveldb::HdfsEnv(buf);
} else if (sscanf(argv[i], "--num_levels=%d%c",
&n, &junk) == 1) {
FLAGS_num_levels = n;
@ -1931,6 +1943,21 @@ int main(int argc, char** argv) {
FLAGS_source_compaction_factor = n;
} else if (sscanf(argv[i], "--wal_ttl=%d%c", &n, &junk) == 1) {
FLAGS_WAL_ttl_seconds = static_cast<uint64_t>(n);
} else if (sscanf(argv[i], "--advise_random_on_open=%d%c", &n, &junk) == 1
&& (n == 0 || n ==1 )) {
FLAGS_advise_random_on_open = n;
} else if (sscanf(argv[i], "--compaction_fadvice=%s", buf) == 1) {
if (!strcasecmp(buf, "NONE"))
FLAGS_compaction_fadvice = leveldb::Options::NONE;
else if (!strcasecmp(buf, "NORMAL"))
FLAGS_compaction_fadvice = leveldb::Options::NORMAL;
else if (!strcasecmp(buf, "SEQUENTIAL"))
FLAGS_compaction_fadvice = leveldb::Options::SEQUENTIAL;
else if (!strcasecmp(buf, "WILLNEED"))
FLAGS_compaction_fadvice = leveldb::Options::WILLNEED;
else {
fprintf(stdout, "Unknown compaction fadvice:%s\n", buf);
}
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);

@ -54,6 +54,9 @@ Status TableCache::FindTable(const EnvOptions& toptions,
s = env_->NewRandomAccessFile(fname, &file, toptions);
RecordTick(options_->statistics, NO_FILE_OPENS);
if (s.ok()) {
if (options_->advise_random_on_open) {
file->Hint(RandomAccessFile::RANDOM);
}
s = Table::Open(*options_, toptions, std::move(file), file_size, &table);
}
@ -74,7 +77,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions,
uint64_t file_number,
uint64_t file_size,
Table** tableptr) {
Table** tableptr,
bool for_compaction) {
if (tableptr != nullptr) {
*tableptr = nullptr;
}
@ -92,6 +96,11 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
if (tableptr != nullptr) {
*tableptr = table;
}
if (for_compaction) {
table->SetAccessHintForCompaction();
}
return result;
}

@ -37,7 +37,8 @@ class TableCache {
const EnvOptions& toptions,
uint64_t file_number,
uint64_t file_size,
Table** tableptr = nullptr);
Table** tableptr = nullptr,
bool for_compaction = false);
// If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value) repeatedly until

@ -180,7 +180,8 @@ class Version::LevelFileNumIterator : public Iterator {
static Iterator* GetFileIterator(void* arg,
const ReadOptions& options,
const EnvOptions& soptions,
const Slice& file_value) {
const Slice& file_value,
bool for_compaction) {
TableCache* cache = reinterpret_cast<TableCache*>(arg);
if (file_value.size() != 16) {
return NewErrorIterator(
@ -189,7 +190,9 @@ static Iterator* GetFileIterator(void* arg,
return cache->NewIterator(options,
soptions,
DecodeFixed64(file_value.data()),
DecodeFixed64(file_value.data() + 8));
DecodeFixed64(file_value.data() + 8),
nullptr /* don't need reference to table*/,
for_compaction);
}
}
@ -1834,13 +1837,15 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
for (size_t i = 0; i < files.size(); i++) {
list[num++] = table_cache_->NewIterator(
options, storage_options_compactions_,
files[i]->number, files[i]->file_size);
files[i]->number, files[i]->file_size, nullptr,
true /* for compaction */);
}
} else {
// Create concatenating iterator for the files from this level
list[num++] = NewTwoLevelIterator(
new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
&GetFileIterator, table_cache_, options, storage_options_);
&GetFileIterator, table_cache_, options, storage_options_,
true /* for compaction */);
}
}
}

@ -249,6 +249,12 @@ class RandomAccessFile {
return 0; // Default implementation to prevent issues with backwards
// compatibility.
};
// Access patterns that a caller can pass to Hint() for this file.
enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
// Advise the file implementation of the expected access pattern.
// The default implementation does nothing, so implementations that
// cannot act on the advice do not need to override it.
virtual void Hint(AccessPattern pattern) {}
};
// A file abstraction for sequential writing. The implementation

@ -441,6 +441,16 @@ struct Options {
// new record will be written to the next block.
// Default is 10.
int block_size_deviation;
// If set true, will hint the underlying file system that the file
// access pattern is random, when a sst file is opened.
// Default: true
bool advise_random_on_open;
// Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction.
// Default: NORMAL
enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;
};
// Options that control read operations

@ -141,6 +141,24 @@ Status Table::Open(const Options& options,
return s;
}
// Advises the table's underlying file of the access pattern selected by
// Options::access_hint_on_compaction_start. Options::NONE issues no advice.
void Table::SetAccessHintForCompaction() {
  const auto configured = rep_->options.access_hint_on_compaction_start;
  if (configured == Options::NONE) {
    return;  // No advice requested.
  }
  if (configured == Options::NORMAL) {
    rep_->file->Hint(RandomAccessFile::NORMAL);
  } else if (configured == Options::SEQUENTIAL) {
    rep_->file->Hint(RandomAccessFile::SEQUENTIAL);
  } else if (configured == Options::WILLNEED) {
    rep_->file->Hint(RandomAccessFile::WILLNEED);
  } else {
    // Not one of the values handled above.
    assert(false);
  }
}
void Table::ReadMeta(const Footer& footer) {
if (rep_->options.filter_policy == nullptr) {
return; // Do not need any metadata
@ -273,7 +291,8 @@ Iterator* Table::BlockReader(void* arg,
Iterator* Table::BlockReader(void* arg,
const ReadOptions& options,
const EnvOptions& soptions,
const Slice& index_value) {
const Slice& index_value,
bool for_compaction) {
return BlockReader(arg, options, index_value, nullptr);
}
@ -285,7 +304,8 @@ Iterator* Table::NewIterator(const ReadOptions& options) const {
Status Table::InternalGet(const ReadOptions& options, const Slice& k,
void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool)) {
bool (*saver)(void*, const Slice&, const Slice&,
bool)) {
Status s;
Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
bool done = false;

@ -64,13 +64,16 @@ class Table {
// REQUIRES: key is in this table.
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
void SetAccessHintForCompaction();
private:
struct Rep;
Rep* rep_;
explicit Table(Rep* rep) { rep_ = rep; }
static Iterator* BlockReader(void*, const ReadOptions&,
const EnvOptions& soptions, const Slice&);
const EnvOptions& soptions, const Slice&,
bool for_compaction);
static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
bool* didIO);

@ -14,7 +14,8 @@ namespace leveldb {
namespace {
typedef Iterator* (*BlockFunction)(void*, const ReadOptions&,
const EnvOptions& soptions, const Slice&);
const EnvOptions& soptions, const Slice&,
bool for_compaction);
class TwoLevelIterator: public Iterator {
public:
@ -23,7 +24,8 @@ class TwoLevelIterator: public Iterator {
BlockFunction block_function,
void* arg,
const ReadOptions& options,
const EnvOptions& soptions);
const EnvOptions& soptions,
bool for_compaction);
virtual ~TwoLevelIterator();
@ -74,6 +76,7 @@ class TwoLevelIterator: public Iterator {
// If data_iter_ is non-nullptr, then "data_block_handle_" holds the
// "index_value" passed to block_function_ to create the data_iter_.
std::string data_block_handle_;
bool for_compaction_;
};
TwoLevelIterator::TwoLevelIterator(
@ -81,13 +84,15 @@ TwoLevelIterator::TwoLevelIterator(
BlockFunction block_function,
void* arg,
const ReadOptions& options,
const EnvOptions& soptions)
const EnvOptions& soptions,
bool for_compaction)
: block_function_(block_function),
arg_(arg),
options_(options),
soptions_(soptions),
index_iter_(index_iter),
data_iter_(nullptr) {
data_iter_(nullptr),
for_compaction_(for_compaction) {
}
TwoLevelIterator::~TwoLevelIterator() {
@ -168,7 +173,8 @@ void TwoLevelIterator::InitDataBlock() {
// data_iter_ is already constructed with this iterator, so
// no need to change anything
} else {
Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle);
Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle,
for_compaction_);
data_block_handle_.assign(handle.data(), handle.size());
SetDataIterator(iter);
}
@ -182,9 +188,10 @@ Iterator* NewTwoLevelIterator(
BlockFunction block_function,
void* arg,
const ReadOptions& options,
const EnvOptions& soptions) {
const EnvOptions& soptions,
bool for_compaction) {
return new TwoLevelIterator(index_iter, block_function, arg,
options, soptions);
options, soptions, for_compaction);
}
} // namespace leveldb

@ -27,10 +27,12 @@ extern Iterator* NewTwoLevelIterator(
void* arg,
const ReadOptions& options,
const EnvOptions& soptions,
const Slice& index_value),
const Slice& index_value,
bool for_compaction),
void* arg,
const ReadOptions& options,
const EnvOptions& soptions);
const EnvOptions& soptions,
bool for_compaction = false);
} // namespace leveldb

@ -207,6 +207,30 @@ class PosixRandomAccessFile: public RandomAccessFile {
return static_cast<size_t>(rid-id);
}
#endif
// Translates the requested access pattern into the matching POSIX_FADV_*
// constant and applies it to the whole file (offset 0, length 0) through
// posix_fadvise(). The return value of posix_fadvise() is not checked.
virtual void Hint(AccessPattern pattern) {
  int advice;
  switch (pattern) {
    case NORMAL:
      advice = POSIX_FADV_NORMAL;
      break;
    case RANDOM:
      advice = POSIX_FADV_RANDOM;
      break;
    case SEQUENTIAL:
      advice = POSIX_FADV_SEQUENTIAL;
      break;
    case WILLNEED:
      advice = POSIX_FADV_WILLNEED;
      break;
    case DONTNEED:
      advice = POSIX_FADV_DONTNEED;
      break;
    default:
      // Unrecognized pattern: issue no advice.
      assert(false);
      return;
  }
  posix_fadvise(fd_, 0, 0, advice);
}
};
// mmap() based random-access

@ -71,9 +71,15 @@ Options::Options()
is_fd_close_on_exec(true),
skip_log_error_on_recovery(false),
stats_dump_period_sec(3600),
block_size_deviation (10) {
block_size_deviation (10),
advise_random_on_open(true),
access_hint_on_compaction_start(NORMAL) {
}
// Printable names for Options::access_hint_on_compaction_start, used by
// Options::Dump(). The table is indexed directly by the enum value, so the
// entries must stay in the same order as the enum's declaration
// (NONE, NORMAL, SEQUENTIAL, WILLNEED).
static const char* const access_hints[] = {
"NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
};
void
Options::Dump(Logger* log) const
{
@ -198,6 +204,10 @@ Options::Dump(Logger* log) const
stats_dump_period_sec);
Log(log," Options.block_size_deviation: %d",
block_size_deviation);
Log(log," Options.advise_random_on_open: %d",
advise_random_on_open);
Log(log," Options.access_hint_on_compaction_start: %s",
access_hints[access_hint_on_compaction_start]);
} // Options::Dump
//

Loading…
Cancel
Save