From ab8d2f6ab244b0a1c54f606f54881d96e5a4cfeb Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Fri, 17 May 2013 15:53:01 -0700 Subject: [PATCH] [RocksDB] [Performance] Allow different posix advice to be applied to the same table file Summary: The current posix advice implementation ties the access pattern hint to the creation of a file. It is not possible to apply different advice for different access patterns (random get vs. compaction read) without keeping two open files for the same table. This patch extends the RandomAccessFile interface to accept a new access hint at any time. In particular, we are able to set different access hints on the same table file based on when/how the file is used. Two options are added to set the access hint: one applied after the file is first opened, and one applied when the file starts being compacted. Test Plan: make check; db_stress; db_bench Reviewers: dhruba Reviewed By: dhruba CC: MarkCallaghan, leveldb Differential Revision: https://reviews.facebook.net/D10905 --- db/db_bench.cc | 33 ++++++++++++++++++++++++++++++--- db/table_cache.cc | 11 ++++++++++- db/table_cache.h | 3 ++- db/version_set.cc | 13 +++++++++---- include/leveldb/env.h | 6 ++++++ include/leveldb/options.h | 10 ++++++++++ table/table.cc | 24 ++++++++++++++++++++++-- table/table.h | 5 ++++- table/two_level_iterator.cc | 21 ++++++++++++++------- table/two_level_iterator.h | 6 ++++-- util/env_posix.cc | 24 ++++++++++++++++++++++++ util/options.cc | 12 +++++++++++- 12 files changed, 146 insertions(+), 22 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 04f1f4ca1..abaa41507 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -293,6 +293,14 @@ static bool FLAGS_use_mmap_writes; // Allow readaheads to occur for compactions static bool FLAGS_use_readahead_compactions; +// Advise random access on table file open +static bool FLAGS_advise_random_on_open = + leveldb::Options().advise_random_on_open; + +// Access pattern advice when a file is compacted +static auto FLAGS_compaction_fadvice = + 
leveldb::Options().access_hint_on_compaction_start; + namespace leveldb { // Helper for quickly generating random data. @@ -900,6 +908,7 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") } if (method != nullptr) { + fprintf(stdout, "DB path: [%s]\n", FLAGS_db); RunBenchmark(num_threads, name, method); } } @@ -1138,6 +1147,8 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") options.allow_mmap_reads = FLAGS_use_mmap_reads; options.allow_mmap_writes = FLAGS_use_mmap_writes; options.allow_readahead_compactions = FLAGS_use_readahead_compactions; + options.advise_random_on_open = FLAGS_advise_random_on_open; + options.access_hint_on_compaction_start = FLAGS_compaction_fadvice; Status s; if(FLAGS_read_only) { s = DB::OpenForReadOnly(options, FLAGS_db, &db_); @@ -1731,8 +1742,9 @@ int main(int argc, char** argv) { int n; long l; char junk; - char hdfsname[2048]; + char buf[2048]; char str[512]; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { @@ -1848,8 +1860,8 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--get_approx=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_get_approx = n; - } else if (sscanf(argv[i], "--hdfs=%s", hdfsname) == 1) { - FLAGS_env = new leveldb::HdfsEnv(hdfsname); + } else if (sscanf(argv[i], "--hdfs=%s", buf) == 1) { + FLAGS_env = new leveldb::HdfsEnv(buf); } else if (sscanf(argv[i], "--num_levels=%d%c", &n, &junk) == 1) { FLAGS_num_levels = n; @@ -1931,6 +1943,21 @@ int main(int argc, char** argv) { FLAGS_source_compaction_factor = n; } else if (sscanf(argv[i], "--wal_ttl=%d%c", &n, &junk) == 1) { FLAGS_WAL_ttl_seconds = static_cast(n); + } else if (sscanf(argv[i], "--advise_random_on_open=%d%c", &n, &junk) == 1 + && (n == 0 || n ==1 )) { + FLAGS_advise_random_on_open = n; + } else if (sscanf(argv[i], "--compaction_fadvice=%s", buf) == 1) { + if 
(!strcasecmp(buf, "NONE")) + FLAGS_compaction_fadvice = leveldb::Options::NONE; + else if (!strcasecmp(buf, "NORMAL")) + FLAGS_compaction_fadvice = leveldb::Options::NORMAL; + else if (!strcasecmp(buf, "SEQUENTIAL")) + FLAGS_compaction_fadvice = leveldb::Options::SEQUENTIAL; + else if (!strcasecmp(buf, "WILLNEED")) + FLAGS_compaction_fadvice = leveldb::Options::WILLNEED; + else { + fprintf(stdout, "Unknown compaction fadvice:%s\n", buf); + } } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/table_cache.cc b/db/table_cache.cc index 9af91fea6..9764119d3 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -54,6 +54,9 @@ Status TableCache::FindTable(const EnvOptions& toptions, s = env_->NewRandomAccessFile(fname, &file, toptions); RecordTick(options_->statistics, NO_FILE_OPENS); if (s.ok()) { + if (options_->advise_random_on_open) { + file->Hint(RandomAccessFile::RANDOM); + } s = Table::Open(*options_, toptions, std::move(file), file_size, &table); } @@ -74,7 +77,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, const EnvOptions& toptions, uint64_t file_number, uint64_t file_size, - Table** tableptr) { + Table** tableptr, + bool for_compaction) { if (tableptr != nullptr) { *tableptr = nullptr; } @@ -92,6 +96,11 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, if (tableptr != nullptr) { *tableptr = table; } + + if (for_compaction) { + table->SetAccessHintForCompaction(); + } + return result; } diff --git a/db/table_cache.h b/db/table_cache.h index f4dc3c86a..83644b1da 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -37,7 +37,8 @@ class TableCache { const EnvOptions& toptions, uint64_t file_number, uint64_t file_size, - Table** tableptr = nullptr); + Table** tableptr = nullptr, + bool for_compaction = false); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until diff --git a/db/version_set.cc 
b/db/version_set.cc index 381ca6b60..f3d192658 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -180,7 +180,8 @@ class Version::LevelFileNumIterator : public Iterator { static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& file_value) { + const Slice& file_value, + bool for_compaction) { TableCache* cache = reinterpret_cast(arg); if (file_value.size() != 16) { return NewErrorIterator( @@ -189,7 +190,9 @@ static Iterator* GetFileIterator(void* arg, return cache->NewIterator(options, soptions, DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8)); + DecodeFixed64(file_value.data() + 8), + nullptr /* don't need reference to table*/, + for_compaction); } } @@ -1834,13 +1837,15 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { for (size_t i = 0; i < files.size(); i++) { list[num++] = table_cache_->NewIterator( options, storage_options_compactions_, - files[i]->number, files[i]->file_size); + files[i]->number, files[i]->file_size, nullptr, + true /* for compaction */); } } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator( new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), - &GetFileIterator, table_cache_, options, storage_options_); + &GetFileIterator, table_cache_, options, storage_options_, + true /* for compaction */); } } } diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 6e163726c..a3e33d6c1 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -249,6 +249,12 @@ class RandomAccessFile { return 0; // Default implementation to prevent issues with backwards // compatibility. }; + + + enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; + + virtual void Hint(AccessPattern pattern) {} + }; // A file abstraction for sequential writing. 
The implementation diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 206547102..5085e160e 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -441,6 +441,16 @@ struct Options { // new record will be written to the next block. // Default is 10. int block_size_deviation; + + // If set true, will hint the underlying file system that the file + // access pattern is random, when a sst file is opened. + // Default: true + bool advise_random_on_open; + + // Specify the file access pattern once a compaction is started. + // It will be applied to all input files of a compaction. + // Default: NORMAL + enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start; }; // Options that control read operations diff --git a/table/table.cc b/table/table.cc index 386b9e074..06a472f2a 100644 --- a/table/table.cc +++ b/table/table.cc @@ -141,6 +141,24 @@ Status Table::Open(const Options& options, return s; } +void Table::SetAccessHintForCompaction() { + switch (rep_->options.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->Hint(RandomAccessFile::NORMAL); + break; + case Options::SEQUENTIAL: + rep_->file->Hint(RandomAccessFile::SEQUENTIAL); + break; + case Options::WILLNEED: + rep_->file->Hint(RandomAccessFile::WILLNEED); + break; + default: + assert(false); + } +} + void Table::ReadMeta(const Footer& footer) { if (rep_->options.filter_policy == nullptr) { return; // Do not need any metadata @@ -273,7 +291,8 @@ Iterator* Table::BlockReader(void* arg, Iterator* Table::BlockReader(void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& index_value) { + const Slice& index_value, + bool for_compaction) { return BlockReader(arg, options, index_value, nullptr); } @@ -285,7 +304,8 @@ Iterator* Table::NewIterator(const ReadOptions& options) const { Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg, - bool (*saver)(void*, 
const Slice&, const Slice&, bool)) { + bool (*saver)(void*, const Slice&, const Slice&, + bool)) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; diff --git a/table/table.h b/table/table.h index bb0bd0385..2baf667b5 100644 --- a/table/table.h +++ b/table/table.h @@ -64,13 +64,16 @@ class Table { // REQUIRES: key is in this table. bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); + void SetAccessHintForCompaction(); + private: struct Rep; Rep* rep_; explicit Table(Rep* rep) { rep_ = rep; } static Iterator* BlockReader(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&); + const EnvOptions& soptions, const Slice&, + bool for_compaction); static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, bool* didIO); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 3d9989739..8af7f18ed 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -14,7 +14,8 @@ namespace leveldb { namespace { typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&); + const EnvOptions& soptions, const Slice&, + bool for_compaction); class TwoLevelIterator: public Iterator { public: @@ -23,7 +24,8 @@ class TwoLevelIterator: public Iterator { BlockFunction block_function, void* arg, const ReadOptions& options, - const EnvOptions& soptions); + const EnvOptions& soptions, + bool for_compaction); virtual ~TwoLevelIterator(); @@ -74,6 +76,7 @@ class TwoLevelIterator: public Iterator { // If data_iter_ is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the data_iter_. 
std::string data_block_handle_; + bool for_compaction_; }; TwoLevelIterator::TwoLevelIterator( @@ -81,13 +84,15 @@ TwoLevelIterator::TwoLevelIterator( BlockFunction block_function, void* arg, const ReadOptions& options, - const EnvOptions& soptions) + const EnvOptions& soptions, + bool for_compaction) : block_function_(block_function), arg_(arg), options_(options), soptions_(soptions), index_iter_(index_iter), - data_iter_(nullptr) { + data_iter_(nullptr), + for_compaction_(for_compaction) { } TwoLevelIterator::~TwoLevelIterator() { @@ -168,7 +173,8 @@ void TwoLevelIterator::InitDataBlock() { // data_iter_ is already constructed with this iterator, so // no need to change anything } else { - Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle); + Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle, + for_compaction_); data_block_handle_.assign(handle.data(), handle.size()); SetDataIterator(iter); } @@ -182,9 +188,10 @@ Iterator* NewTwoLevelIterator( BlockFunction block_function, void* arg, const ReadOptions& options, - const EnvOptions& soptions) { + const EnvOptions& soptions, + bool for_compaction) { return new TwoLevelIterator(index_iter, block_function, arg, - options, soptions); + options, soptions, for_compaction); } } // namespace leveldb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index e1b376638..bb2828224 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -27,10 +27,12 @@ extern Iterator* NewTwoLevelIterator( void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& index_value), + const Slice& index_value, + bool for_compaction), void* arg, const ReadOptions& options, - const EnvOptions& soptions); + const EnvOptions& soptions, + bool for_compaction = false); } // namespace leveldb diff --git a/util/env_posix.cc b/util/env_posix.cc index 78afcddd6..3c111bca3 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -207,6 +207,30 @@ class 
PosixRandomAccessFile: public RandomAccessFile { return static_cast(rid-id); } #endif + + virtual void Hint(AccessPattern pattern) { + switch(pattern) { + case NORMAL: + posix_fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); + break; + case RANDOM: + posix_fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); + break; + case SEQUENTIAL: + posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + break; + case WILLNEED: + posix_fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); + break; + case DONTNEED: + posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + break; + default: + assert(false); + break; + } + } + }; // mmap() based random-access diff --git a/util/options.cc b/util/options.cc index e3c041c37..6f7731d8d 100644 --- a/util/options.cc +++ b/util/options.cc @@ -71,9 +71,15 @@ Options::Options() is_fd_close_on_exec(true), skip_log_error_on_recovery(false), stats_dump_period_sec(3600), - block_size_deviation (10) { + block_size_deviation (10), + advise_random_on_open(true), + access_hint_on_compaction_start(NORMAL) { } +static const char* const access_hints[] = { + "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" +}; + void Options::Dump(Logger* log) const { @@ -198,6 +204,10 @@ Options::Dump(Logger* log) const stats_dump_period_sec); Log(log," Options.block_size_deviation: %d", block_size_deviation); + Log(log," Options.advise_random_on_open: %d", + advise_random_on_open); + Log(log," Options.access_hint_on_compaction_start: %s", + access_hints[access_hint_on_compaction_start]); } // Options::Dump //