diff --git a/db/db_bench.cc b/db/db_bench.cc index 04f1f4ca1..abaa41507 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -293,6 +293,14 @@ static bool FLAGS_use_mmap_writes; // Allow readaheads to occur for compactions static bool FLAGS_use_readahead_compactions; +// Advise random access on table file open +static bool FLAGS_advise_random_on_open = + leveldb::Options().advise_random_on_open; + +// Access pattern advice when a file is compacted +static auto FLAGS_compaction_fadvice = + leveldb::Options().access_hint_on_compaction_start; + namespace leveldb { // Helper for quickly generating random data. @@ -900,6 +908,7 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") } if (method != nullptr) { + fprintf(stdout, "DB path: [%s]\n", FLAGS_db); RunBenchmark(num_threads, name, method); } } @@ -1138,6 +1147,8 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") options.allow_mmap_reads = FLAGS_use_mmap_reads; options.allow_mmap_writes = FLAGS_use_mmap_writes; options.allow_readahead_compactions = FLAGS_use_readahead_compactions; + options.advise_random_on_open = FLAGS_advise_random_on_open; + options.access_hint_on_compaction_start = FLAGS_compaction_fadvice; Status s; if(FLAGS_read_only) { s = DB::OpenForReadOnly(options, FLAGS_db, &db_); @@ -1731,8 +1742,9 @@ int main(int argc, char** argv) { int n; long l; char junk; - char hdfsname[2048]; + char buf[2048]; char str[512]; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { @@ -1848,8 +1860,8 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--get_approx=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_get_approx = n; - } else if (sscanf(argv[i], "--hdfs=%s", hdfsname) == 1) { - FLAGS_env = new leveldb::HdfsEnv(hdfsname); + } else if (sscanf(argv[i], "--hdfs=%2047s", buf) == 1) { + FLAGS_env = new leveldb::HdfsEnv(buf); } else if 
(sscanf(argv[i], "--num_levels=%d%c", &n, &junk) == 1) { FLAGS_num_levels = n; @@ -1931,6 +1943,22 @@ int main(int argc, char** argv) { FLAGS_source_compaction_factor = n; } else if (sscanf(argv[i], "--wal_ttl=%d%c", &n, &junk) == 1) { FLAGS_WAL_ttl_seconds = static_cast(n); + } else if (sscanf(argv[i], "--advise_random_on_open=%d%c", &n, &junk) == 1 + && (n == 0 || n == 1)) { + FLAGS_advise_random_on_open = n; + } else if (sscanf(argv[i], "--compaction_fadvice=%2047s", buf) == 1) { + if (!strcasecmp(buf, "NONE")) + FLAGS_compaction_fadvice = leveldb::Options::NONE; + else if (!strcasecmp(buf, "NORMAL")) + FLAGS_compaction_fadvice = leveldb::Options::NORMAL; + else if (!strcasecmp(buf, "SEQUENTIAL")) + FLAGS_compaction_fadvice = leveldb::Options::SEQUENTIAL; + else if (!strcasecmp(buf, "WILLNEED")) + FLAGS_compaction_fadvice = leveldb::Options::WILLNEED; + else { + fprintf(stderr, "Unknown compaction fadvice:%s\n", buf); + exit(1); + } } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/table_cache.cc b/db/table_cache.cc index 9af91fea6..9764119d3 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -54,6 +54,9 @@ Status TableCache::FindTable(const EnvOptions& toptions, s = env_->NewRandomAccessFile(fname, &file, toptions); RecordTick(options_->statistics, NO_FILE_OPENS); if (s.ok()) { + if (options_->advise_random_on_open) { + file->Hint(RandomAccessFile::RANDOM); + } s = Table::Open(*options_, toptions, std::move(file), file_size, &table); } @@ -74,7 +77,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, const EnvOptions& toptions, uint64_t file_number, uint64_t file_size, - Table** tableptr) { + Table** tableptr, + bool for_compaction) { if (tableptr != nullptr) { *tableptr = nullptr; } @@ -92,6 +96,11 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, if (tableptr != nullptr) { *tableptr = table; } + + if (for_compaction) { + table->SetAccessHintForCompaction(); + } + + return result; } diff --git 
a/db/table_cache.h b/db/table_cache.h index f4dc3c86a..83644b1da 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -37,7 +37,8 @@ class TableCache { const EnvOptions& toptions, uint64_t file_number, uint64_t file_size, - Table** tableptr = nullptr); + Table** tableptr = nullptr, + bool for_compaction = false); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until diff --git a/db/version_set.cc b/db/version_set.cc index 381ca6b60..f3d192658 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -180,7 +180,8 @@ class Version::LevelFileNumIterator : public Iterator { static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& file_value) { + const Slice& file_value, + bool for_compaction) { TableCache* cache = reinterpret_cast(arg); if (file_value.size() != 16) { return NewErrorIterator( @@ -189,7 +190,9 @@ static Iterator* GetFileIterator(void* arg, return cache->NewIterator(options, soptions, DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8)); + DecodeFixed64(file_value.data() + 8), + nullptr /* don't need reference to table*/, + for_compaction); } } @@ -1834,13 +1837,15 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { for (size_t i = 0; i < files.size(); i++) { list[num++] = table_cache_->NewIterator( options, storage_options_compactions_, - files[i]->number, files[i]->file_size); + files[i]->number, files[i]->file_size, nullptr, + true /* for compaction */); } } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator( new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), - &GetFileIterator, table_cache_, options, storage_options_); + &GetFileIterator, table_cache_, options, storage_options_, + true /* for compaction */); } } } diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 6e163726c..a3e33d6c1 100644 --- 
a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -249,6 +249,12 @@ class RandomAccessFile { return 0; // Default implementation to prevent issues with backwards // compatibility. }; + + + enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; + + virtual void Hint(AccessPattern pattern) {} + + }; // A file abstraction for sequential writing. The implementation diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 206547102..5085e160e 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -441,6 +441,16 @@ struct Options { // new record will be written to the next block. // Default is 10. int block_size_deviation; + + // If set true, will hint the underlying file system that the file + // access pattern is random, when a sst file is opened. + // Default: true + bool advise_random_on_open; + + // Specify the file access pattern once a compaction is started. + // It will be applied to all input files of a compaction. + // Default: NORMAL + enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start; }; // Options that control read operations diff --git a/table/table.cc b/table/table.cc index 386b9e074..06a472f2a 100644 --- a/table/table.cc +++ b/table/table.cc @@ -141,6 +141,24 @@ Status Table::Open(const Options& options, return s; } +void Table::SetAccessHintForCompaction() { + switch (rep_->options.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->Hint(RandomAccessFile::NORMAL); + break; + case Options::SEQUENTIAL: + rep_->file->Hint(RandomAccessFile::SEQUENTIAL); + break; + case Options::WILLNEED: + rep_->file->Hint(RandomAccessFile::WILLNEED); + break; + default: + assert(false); + } +} + void Table::ReadMeta(const Footer& footer) { if (rep_->options.filter_policy == nullptr) { return; // Do not need any metadata @@ -273,7 +291,8 @@ Iterator* Table::BlockReader(void* arg, Iterator* Table::BlockReader(void* arg, const ReadOptions& 
options, const EnvOptions& soptions, - const Slice& index_value) { + const Slice& index_value, + bool for_compaction) { return BlockReader(arg, options, index_value, nullptr); } @@ -285,7 +304,8 @@ Iterator* Table::NewIterator(const ReadOptions& options) const { Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg, - bool (*saver)(void*, const Slice&, const Slice&, bool)) { + bool (*saver)(void*, const Slice&, const Slice&, + bool)) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; diff --git a/table/table.h b/table/table.h index bb0bd0385..2baf667b5 100644 --- a/table/table.h +++ b/table/table.h @@ -64,13 +64,17 @@ class Table { // REQUIRES: key is in this table. bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); + // Forwards options.access_hint_on_compaction_start to the table's file via Hint(); NONE issues no hint. + void SetAccessHintForCompaction(); + private: struct Rep; Rep* rep_; explicit Table(Rep* rep) { rep_ = rep; } static Iterator* BlockReader(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&); + const EnvOptions& soptions, const Slice&, + bool for_compaction); static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, bool* didIO); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 3d9989739..8af7f18ed 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -14,7 +14,8 @@ namespace leveldb { namespace { typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, - const EnvOptions& soptions, const Slice&); + const EnvOptions& soptions, const Slice&, + bool for_compaction); class TwoLevelIterator: public Iterator { public: @@ -23,7 +24,8 @@ class TwoLevelIterator: public Iterator { BlockFunction block_function, void* arg, const ReadOptions& options, - const EnvOptions& soptions); + const EnvOptions& soptions, + bool for_compaction); virtual ~TwoLevelIterator(); @@ -74,6 +76,7 
@@ class TwoLevelIterator: public Iterator { // If data_iter_ is non-nullptr, then "data_block_handle_" 
holds the // "index_value" passed to block_function_ to create the data_iter_. std::string data_block_handle_; + bool for_compaction_; }; TwoLevelIterator::TwoLevelIterator( @@ -81,13 +84,15 @@ TwoLevelIterator::TwoLevelIterator( BlockFunction block_function, void* arg, const ReadOptions& options, - const EnvOptions& soptions) + const EnvOptions& soptions, + bool for_compaction) : block_function_(block_function), arg_(arg), options_(options), soptions_(soptions), index_iter_(index_iter), - data_iter_(nullptr) { + data_iter_(nullptr), + for_compaction_(for_compaction) { } TwoLevelIterator::~TwoLevelIterator() { @@ -168,7 +173,8 @@ void TwoLevelIterator::InitDataBlock() { // data_iter_ is already constructed with this iterator, so // no need to change anything } else { - Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle); + Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle, + for_compaction_); data_block_handle_.assign(handle.data(), handle.size()); SetDataIterator(iter); } @@ -182,9 +188,10 @@ Iterator* NewTwoLevelIterator( BlockFunction block_function, void* arg, const ReadOptions& options, - const EnvOptions& soptions) { + const EnvOptions& soptions, + bool for_compaction) { return new TwoLevelIterator(index_iter, block_function, arg, - options, soptions); + options, soptions, for_compaction); } } // namespace leveldb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index e1b376638..bb2828224 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -27,10 +27,12 @@ extern Iterator* NewTwoLevelIterator( void* arg, const ReadOptions& options, const EnvOptions& soptions, - const Slice& index_value), + const Slice& index_value, + bool for_compaction), void* arg, const ReadOptions& options, - const EnvOptions& soptions); + const EnvOptions& soptions, + bool for_compaction = false); } // namespace leveldb diff --git a/util/env_posix.cc b/util/env_posix.cc index 78afcddd6..3c111bca3 100644 --- 
a/util/env_posix.cc +++ b/util/env_posix.cc @@ -207,6 +207,30 @@ class PosixRandomAccessFile: public RandomAccessFile { return static_cast(rid-id); } #endif + + virtual void Hint(AccessPattern pattern) { + switch(pattern) { + case NORMAL: + posix_fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); + break; + case RANDOM: + posix_fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); + break; + case SEQUENTIAL: + posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + break; + case WILLNEED: + posix_fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); + break; + case DONTNEED: + posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + break; + default: + assert(false); + break; + } + } + }; // mmap() based random-access diff --git a/util/options.cc b/util/options.cc index e3c041c37..6f7731d8d 100644 --- a/util/options.cc +++ b/util/options.cc @@ -71,9 +71,15 @@ Options::Options() is_fd_close_on_exec(true), skip_log_error_on_recovery(false), stats_dump_period_sec(3600), - block_size_deviation (10) { + block_size_deviation (10), + advise_random_on_open(true), + access_hint_on_compaction_start(NORMAL) { } +static const char* const access_hints[] = { + "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" +}; + void Options::Dump(Logger* log) const { @@ -198,6 +204,10 @@ Options::Dump(Logger* log) const stats_dump_period_sec); Log(log," Options.block_size_deviation: %d", block_size_deviation); + Log(log," Options.advise_random_on_open: %d", + advise_random_on_open); + Log(log," Options.access_hint_on_compaction_start: %s", + access_hints[access_hint_on_compaction_start]); } // Options::Dump //