From 5582123dee8426a5191dfd5e846cea8c676c793c Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 12 Jun 2017 06:58:25 -0700 Subject: [PATCH] Sample number of reads per SST file Summary: We estimate number of reads per SST file, by updating the counter per file in sampled read requests. This information can later be used to trigger compactions to improve read performance. Closes https://github.com/facebook/rocksdb/pull/2417 Differential Revision: D5193528 Pulled By: siying fbshipit-source-id: b4241c5ad0eaf444b61afb53f8e6290d9f5da2df --- HISTORY.md | 2 ++ db/compaction.h | 3 +- db/internal_stats.cc | 2 +- db/version_edit.h | 40 ++++++++++++++++++------ db/version_set.cc | 58 ++++++++++++++++++++++++++--------- db/version_set.h | 2 +- include/rocksdb/metadata.h | 26 ++++++++-------- monitoring/file_read_sample.h | 25 +++++++++++++++ table/get_context.cc | 2 ++ table/get_context.h | 3 ++ 10 files changed, 124 insertions(+), 39 deletions(-) create mode 100644 monitoring/file_read_sample.h diff --git a/HISTORY.md b/HISTORY.md index bbf5bb5a1..5ee1b7248 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,12 +5,14 @@ * Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction. * options.delayed_write_rate by default take the value of options.rate_limiter rate. * Replace global variable `IOStatsContext iostats_context` with `IOStatsContext* get_iostats_context()`; replace global variable `PerfContext perf_context` with `PerfContext* get_perf_context()`. +* DB property "rocksdb.sstables" now prints keys in hex form. ### New Features * Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads. 
* Users can pass a cache object to write buffer manager, so that they can cap memory usage for memtable and block cache using one single limit. * Flush will be triggered when 7/8 of the limit introduced by write_buffer_manager or db_write_buffer_size is triggered, so that the hard threshold is hard to hit. * Introduce WriteOptions.low_pri. If it is true, low priority writes will be throttled if the compaction is behind. +* Measure estimated number of reads per file. The information can be accessed through DB::GetColumnFamilyMetaData or "rocksdb.sstables" DB property. ## 5.5.0 (05/17/2017) ### New Features diff --git a/db/compaction.h b/db/compaction.h index 954a90a83..457c2cd07 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -101,7 +101,8 @@ class Compaction { // input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and // < "input_levels()" - const std::vector* inputs(size_t compaction_input_level) { + const std::vector* inputs( + size_t compaction_input_level) const { assert(compaction_input_level < inputs_.size()); return &inputs_[compaction_input_level].files; } diff --git a/db/internal_stats.cc b/db/internal_stats.cc index c23efd565..c2a528e83 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -544,7 +544,7 @@ bool InternalStats::HandleDBStats(std::string* value, Slice suffix) { bool InternalStats::HandleSsTables(std::string* value, Slice suffix) { auto* current = cfd_->current(); - *value = current->DebugString(); + *value = current->DebugString(true, true); return true; } diff --git a/db/version_edit.h b/db/version_edit.h index bdfe81533..72b052277 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -63,18 +63,30 @@ struct FileDescriptor { uint64_t GetFileSize() const { return file_size; } }; +struct FileSampledStats { + FileSampledStats() : num_reads_sampled(0) {} + FileSampledStats(const FileSampledStats& other) { *this = other; } + FileSampledStats& operator=(const FileSampledStats& other) { + num_reads_sampled = 
other.num_reads_sampled.load(); + return *this; + } + + // number of user reads to this file. + mutable std::atomic num_reads_sampled; +}; + struct FileMetaData { - int refs; FileDescriptor fd; InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table - bool being_compacted; // Is this file undergoing compaction? SequenceNumber smallest_seqno; // The smallest seqno in this file SequenceNumber largest_seqno; // The largest seqno in this file // Needs to be disposed when refs becomes 0. Cache::Handle* table_reader_handle; + FileSampledStats stats; + // Stats for compensating deletion entries during compaction // File size compensated by deletion entry. @@ -87,6 +99,10 @@ struct FileMetaData { uint64_t num_deletions; // the number of deletion entries. uint64_t raw_key_size; // total uncompressed key size. uint64_t raw_value_size; // total uncompressed value size. + + int refs; // Reference count + + bool being_compacted; // Is this file undergoing compaction? bool init_stats_from_file; // true if the data-entry stats of this file // has initialized from file. @@ -94,9 +110,7 @@ struct FileMetaData { // file. FileMetaData() - : refs(0), - being_compacted(false), - smallest_seqno(kMaxSequenceNumber), + : smallest_seqno(kMaxSequenceNumber), largest_seqno(0), table_reader_handle(nullptr), compensated_file_size(0), @@ -104,6 +118,8 @@ struct FileMetaData { num_deletions(0), raw_key_size(0), raw_value_size(0), + refs(0), + being_compacted(false), init_stats_from_file(false), marked_for_compaction(false) {} @@ -119,10 +135,12 @@ struct FileMetaData { } }; -// A compressed copy of file meta data that just contain -// smallest and largest key's slice +// A compressed copy of file meta data that just contains minimum data needed +// to serve read operations, while still keeping the pointer to full metadata +// of the file in case it is needed. 
struct FdWithKeyRange { FileDescriptor fd; + FileMetaData* file_metadata; // Point to all metadata Slice smallest_key; // slice that contain smallest key Slice largest_key; // slice that contain largest key @@ -132,8 +150,12 @@ struct FdWithKeyRange { largest_key() { } - FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key) - : fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {} + FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key, + FileMetaData* _file_metadata) + : fd(_fd), + file_metadata(_file_metadata), + smallest_key(_smallest_key), + largest_key(_largest_key) {} }; // Data structure to store an array of FdWithKeyRange in one level diff --git a/db/version_set.cc b/db/version_set.cc index ae284c085..cfd905c34 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -34,6 +34,7 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -122,7 +123,7 @@ class FilePicker { } } - int GetCurrentLevel() { return returned_file_level_; } + int GetCurrentLevel() const { return curr_level_; } FdWithKeyRange* GetNextFile() { while (!search_ended_) { // Loops over different levels. 
@@ -227,9 +228,7 @@ class FilePicker { unsigned int hit_file_level_; int32_t search_left_bound_; int32_t search_right_bound_; -#ifndef NDEBUG std::vector* files_; -#endif autovector* level_files_brief_; bool search_ended_; bool is_hit_file_last_in_level_; @@ -370,6 +369,7 @@ void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, FdWithKeyRange& f = file_level->files[i]; f.fd = files[i]->fd; + f.file_metadata = files[i]; f.smallest_key = Slice(mem, smallest_size); f.largest_key = Slice(mem + smallest_size, largest_size); } @@ -437,12 +437,12 @@ namespace { class LevelFileNumIterator : public InternalIterator { public: LevelFileNumIterator(const InternalKeyComparator& icmp, - const LevelFilesBrief* flevel) + const LevelFilesBrief* flevel, bool should_sample) : icmp_(icmp), flevel_(flevel), index_(static_cast(flevel->num_files)), - current_value_(0, 0, 0) { // Marks as invalid - } + current_value_(0, 0, 0), // Marks as invalid + should_sample_(should_sample) {} virtual bool Valid() const override { return index_ < flevel_->num_files; } virtual void Seek(const Slice& target) override { index_ = FindFile(icmp_, *flevel_, target); @@ -477,6 +477,9 @@ class LevelFileNumIterator : public InternalIterator { assert(Valid()); auto file_meta = flevel_->files[index_]; + if (should_sample_) { + sample_file_read_inc(file_meta.file_metadata); + } current_value_ = file_meta.fd; return Slice(reinterpret_cast(¤t_value_), sizeof(FileDescriptor)); @@ -488,6 +491,7 @@ class LevelFileNumIterator : public InternalIterator { const LevelFilesBrief* flevel_; uint32_t index_; mutable FileDescriptor current_value_; + bool should_sample_; }; class LevelFileIteratorState : public TwoLevelIteratorState { @@ -745,13 +749,11 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file_path = ioptions->db_paths.back().path; } files.emplace_back( - MakeTableFileName("", file->fd.GetNumber()), - file_path, - file->fd.GetFileSize(), - file->smallest_seqno, - file->largest_seqno, 
+ MakeTableFileName("", file->fd.GetNumber()), file_path, + file->fd.GetFileSize(), file->smallest_seqno, file->largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), + file->stats.num_reads_sampled.load(std::memory_order_relaxed), file->being_compacted); level_size += file->fd.GetFileSize(); } @@ -835,6 +837,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, return; } + bool should_sample = should_sample_file_read(); + auto* arena = merge_iter_builder->GetArena(); if (level == 0) { // Merge all level zero files together since they may overlap @@ -845,6 +849,15 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, range_del_agg, nullptr, cfd_->internal_stats()->GetFileReadHist(0), false, arena, false /* skip_filters */, 0 /* level */)); } + if (should_sample) { + // Count ones for every L0 file. This is done per iterator creation + // rather than Seek(), while files in other levels are recorded per seek. + // If users execute one range query per iterator, there may be some + // discrepancy here. 
+ for (FileMetaData* meta : storage_info_.LevelFiles(0)) { + sample_file_read_inc(meta); + } + } } else { // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them @@ -859,7 +872,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, IsFilterSkipped(level), level, range_del_agg); mem = arena->AllocateAligned(sizeof(LevelFileNumIterator)); auto* first_level_iter = new (mem) LevelFileNumIterator( - cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level)); + cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), + should_sample_file_read()); merge_iter_builder->AddIterator( NewTwoLevelIterator(state, first_level_iter, arena, false)); } @@ -984,6 +998,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { + if (get_context.sample()) { + sample_file_read_inc(f->file_metadata); + } *status = table_cache_->Get( read_options, *internal_comparator(), f->fd, ikey, &get_context, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), @@ -2201,13 +2218,16 @@ void Version::AddLiveFiles(std::vector* live) { } } -std::string Version::DebugString(bool hex) const { +std::string Version::DebugString(bool hex, bool print_stats) const { std::string r; for (int level = 0; level < storage_info_.num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 'd'] // 20:43['e' .. 'g'] + // + // if print_stats=true: + // 17:123['a' .. 'd'](4096) r.append("--- level "); AppendNumberTo(&r, level); r.append(" --- version# "); @@ -2223,7 +2243,14 @@ std::string Version::DebugString(bool hex) const { r.append(files[i]->smallest.DebugString(hex)); r.append(" .. 
"); r.append(files[i]->largest.DebugString(hex)); - r.append("]\n"); + r.append("]"); + if (print_stats) { + r.append("("); + r.append(ToString( + files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed))); + r.append(")"); + } + r.append("\n"); } } return r; @@ -3533,7 +3560,8 @@ InternalIterator* VersionSet::MakeInputIterator( false /* skip_filters */, (int)which /* level */, range_del_agg), new LevelFileNumIterator(cfd->internal_comparator(), - c->input_levels(which))); + c->input_levels(which), + false /* don't sample compaction */)); } } } diff --git a/db/version_set.h b/db/version_set.h index 7d94d692c..f1f0dcb64 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -504,7 +504,7 @@ class Version { void AddLiveFiles(std::vector* live); // Return a human readable string that describes this version's contents. - std::string DebugString(bool hex = false) const; + std::string DebugString(bool hex = false, bool print_stats = false) const; // Returns the version nuber of this version uint64_t GetVersionNumber() const { return version_number_; } diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 5425146d7..0b8382633 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -55,17 +55,21 @@ struct LevelMetaData { // The metadata that describes a SST file. 
struct SstFileMetaData { SstFileMetaData() {} - SstFileMetaData(const std::string& _file_name, - const std::string& _path, uint64_t _size, - SequenceNumber _smallest_seqno, + SstFileMetaData(const std::string& _file_name, const std::string& _path, + uint64_t _size, SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, - const std::string& _largestkey, - bool _being_compacted) : - size(_size), name(_file_name), - db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), - smallestkey(_smallestkey), largestkey(_largestkey), - being_compacted(_being_compacted) {} + const std::string& _largestkey, uint64_t _num_reads_sampled, + bool _being_compacted) + : size(_size), + name(_file_name), + db_path(_path), + smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno), + smallestkey(_smallestkey), + largestkey(_largestkey), + num_reads_sampled(_num_reads_sampled), + being_compacted(_being_compacted) {} // File size in bytes. uint64_t size; @@ -78,6 +82,7 @@ struct SstFileMetaData { SequenceNumber largest_seqno; // Largest sequence number in file. std::string smallestkey; // Smallest user defined key in the file. std::string largestkey; // Largest user defined key in the file. + uint64_t num_reads_sampled; // How many times the file is read. bool being_compacted; // true if the file is currently being compacted. }; @@ -86,7 +91,4 @@ struct LiveFileMetaData : SstFileMetaData { std::string column_family_name; // Name of the column family int level; // Level at which this file resides. }; - - - } // namespace rocksdb diff --git a/monitoring/file_read_sample.h b/monitoring/file_read_sample.h new file mode 100644 index 000000000..2cefe5522 --- /dev/null +++ b/monitoring/file_read_sample.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. +// +#pragma once +#include "db/version_edit.h" +#include "util/random.h" + +namespace rocksdb { +static const uint32_t kFileReadSampleRate = 1024; +extern bool should_sample_file_read(); +extern void sample_file_read_inc(FileMetaData*); + +inline bool should_sample_file_read() { + return (Random::GetTLSInstance()->Next() % kFileReadSampleRate == 307); +} + +inline void sample_file_read_inc(FileMetaData* meta) { + meta->stats.num_reads_sampled.fetch_add(kFileReadSampleRate, + std::memory_order_relaxed); +} +} diff --git a/table/get_context.cc b/table/get_context.cc index 9532f3654..060bd62b9 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -8,6 +8,7 @@ #include "table/get_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" +#include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" @@ -59,6 +60,7 @@ GetContext::GetContext(const Comparator* ucmp, if (seq_) { *seq_ = kMaxSequenceNumber; } + sample_ = should_sample_file_read(); } // Called from TableCache::Get and Table::Get when file/block in which diff --git a/table/get_context.h b/table/get_context.h index ec33368aa..d58753346 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -62,6 +62,8 @@ class GetContext { // Do we need to fetch the SequenceNumber for this key? 
bool NeedToReadSequence() const { return (seq_ != nullptr); } + bool sample() const { return sample_; } + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -82,6 +84,7 @@ class GetContext { std::string* replay_log_; // Used to temporarily pin blocks when state_ == GetContext::kMerge PinnedIteratorsManager* pinned_iters_mgr_; + bool sample_; }; void replayGetContextLog(const Slice& replay_log, const Slice& user_key,