Sample number of reads per SST file

Summary:
We estimate the number of reads per SST file by updating a per-file counter on sampled read requests. This information can later be used to trigger compactions that improve read performance.
Closes https://github.com/facebook/rocksdb/pull/2417

Differential Revision: D5193528

Pulled By: siying

fbshipit-source-id: b4241c5ad0eaf444b61afb53f8e6290d9f5da2df
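
As a rough illustration of how the new counter can be consumed (a minimal usage sketch, not part of the commit; the helper name PrintSampledReads is made up), the estimated read counts are exposed per file through DB::GetColumnFamilyMetaData and, in printed form, through the "rocksdb.sstables" DB property:

#include <cstdio>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

void PrintSampledReads(rocksdb::DB* db) {  // hypothetical helper
  rocksdb::ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(&cf_meta);
  for (const auto& level : cf_meta.levels) {
    for (const auto& file : level.files) {
      // num_reads_sampled is the new estimated read counter per SST file.
      std::printf("L%d %s: ~%llu reads\n", level.level, file.name.c_str(),
                  static_cast<unsigned long long>(file.num_reads_sampled));
    }
  }
  // The same counters are also appended to the per-file lines of the
  // "rocksdb.sstables" DB property.
  std::string sstables;
  db->GetProperty("rocksdb.sstables", &sstables);
  std::printf("%s\n", sstables.c_str());
}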
main
Siying Dong 7 years ago committed by Facebook Github Bot
parent db818d2d1a
commit 5582123dee
  1. HISTORY.md (2)
  2. db/compaction.h (3)
  3. db/internal_stats.cc (2)
  4. db/version_edit.h (40)
  5. db/version_set.cc (58)
  6. db/version_set.h (2)
  7. include/rocksdb/metadata.h (26)
  8. monitoring/file_read_sample.h (25)
  9. table/get_context.cc (2)
  10. table/get_context.h (3)

@ -5,12 +5,14 @@
* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction.
* options.delayed_write_rate by default takes the value of options.rate_limiter rate.
* Replace global variable `IOStatsContext iostats_context` with `IOStatsContext* get_iostats_context()`; replace global variable `PerfContext perf_context` with `PerfContext* get_perf_context()`.
* DB property "rocksdb.sstables" now prints keys in hex form.
### New Features
* Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads.
* Users can pass a cache object to write buffer manager, so that they can cap memory usage for memtable and block cache using one single limit.
* Flush will be triggered when 7/8 of the limit introduced by write_buffer_manager or db_write_buffer_size is reached, so that the hard threshold is hard to hit.
* Introduce WriteOptions.low_pri. If it is true, low priority writes will be throttled if the compaction is behind.
* Measure estimated number of reads per file. The information can be accessed through DB::GetColumnFamilyMetaData or "rocksdb.sstables" DB property.
## 5.5.0 (05/17/2017)
### New Features

@ -101,7 +101,8 @@ class Compaction {
// input level.
// REQUIREMENT: "compaction_input_level" must be >= 0 and
// < "input_levels()"
const std::vector<FileMetaData*>* inputs(size_t compaction_input_level) {
const std::vector<FileMetaData*>* inputs(
size_t compaction_input_level) const {
assert(compaction_input_level < inputs_.size());
return &inputs_[compaction_input_level].files;
}

@ -544,7 +544,7 @@ bool InternalStats::HandleDBStats(std::string* value, Slice suffix) {
bool InternalStats::HandleSsTables(std::string* value, Slice suffix) {
auto* current = cfd_->current();
*value = current->DebugString();
*value = current->DebugString(true, true);
return true;
}

@ -63,18 +63,30 @@ struct FileDescriptor {
uint64_t GetFileSize() const { return file_size; }
};
struct FileSampledStats {
FileSampledStats() : num_reads_sampled(0) {}
FileSampledStats(const FileSampledStats& other) { *this = other; }
FileSampledStats& operator=(const FileSampledStats& other) {
num_reads_sampled = other.num_reads_sampled.load();
return *this;
}
// number of user reads to this file.
mutable std::atomic<uint64_t> num_reads_sampled;
};
struct FileMetaData {
int refs;
FileDescriptor fd;
InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table
bool being_compacted; // Is this file undergoing compaction?
SequenceNumber smallest_seqno; // The smallest seqno in this file
SequenceNumber largest_seqno; // The largest seqno in this file
// Needs to be disposed when refs becomes 0.
Cache::Handle* table_reader_handle;
FileSampledStats stats;
// Stats for compensating deletion entries during compaction
// File size compensated by deletion entry.
@ -87,6 +99,10 @@ struct FileMetaData {
uint64_t num_deletions; // the number of deletion entries.
uint64_t raw_key_size; // total uncompressed key size.
uint64_t raw_value_size; // total uncompressed value size.
int refs; // Reference count
bool being_compacted; // Is this file undergoing compaction?
bool init_stats_from_file; // true if the data-entry stats of this file
// has initialized from file.
@ -94,9 +110,7 @@ struct FileMetaData {
// file.
FileMetaData()
: refs(0),
being_compacted(false),
smallest_seqno(kMaxSequenceNumber),
: smallest_seqno(kMaxSequenceNumber),
largest_seqno(0),
table_reader_handle(nullptr),
compensated_file_size(0),
@ -104,6 +118,8 @@ struct FileMetaData {
num_deletions(0),
raw_key_size(0),
raw_value_size(0),
refs(0),
being_compacted(false),
init_stats_from_file(false),
marked_for_compaction(false) {}
@ -119,10 +135,12 @@ struct FileMetaData {
}
};
// A compressed copy of file meta data that just contain
// smallest and largest key's slice
// A compressed copy of file meta data that contains only the minimum data
// needed to serve read operations, while still keeping the pointer to the
// full metadata of the file in case it is needed.
struct FdWithKeyRange {
FileDescriptor fd;
FileMetaData* file_metadata; // Point to all metadata
Slice smallest_key; // slice that contain smallest key
Slice largest_key; // slice that contain largest key
@ -132,8 +150,12 @@ struct FdWithKeyRange {
largest_key() {
}
FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key)
: fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {}
FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
FileMetaData* _file_metadata)
: fd(_fd),
file_metadata(_file_metadata),
smallest_key(_smallest_key),
largest_key(_largest_key) {}
};
// Data structure to store an array of FdWithKeyRange in one level

@ -34,6 +34,7 @@
#include "db/pinned_iterators_manager.h"
#include "db/table_cache.h"
#include "db/version_builder.h"
#include "monitoring/file_read_sample.h"
#include "monitoring/perf_context_imp.h"
#include "rocksdb/env.h"
#include "rocksdb/merge_operator.h"
@ -122,7 +123,7 @@ class FilePicker {
}
}
int GetCurrentLevel() { return returned_file_level_; }
int GetCurrentLevel() const { return curr_level_; }
FdWithKeyRange* GetNextFile() {
while (!search_ended_) { // Loops over different levels.
@ -227,9 +228,7 @@ class FilePicker {
unsigned int hit_file_level_;
int32_t search_left_bound_;
int32_t search_right_bound_;
#ifndef NDEBUG
std::vector<FileMetaData*>* files_;
#endif
autovector<LevelFilesBrief>* level_files_brief_;
bool search_ended_;
bool is_hit_file_last_in_level_;
@ -370,6 +369,7 @@ void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
FdWithKeyRange& f = file_level->files[i];
f.fd = files[i]->fd;
f.file_metadata = files[i];
f.smallest_key = Slice(mem, smallest_size);
f.largest_key = Slice(mem + smallest_size, largest_size);
}
@ -437,12 +437,12 @@ namespace {
class LevelFileNumIterator : public InternalIterator {
public:
LevelFileNumIterator(const InternalKeyComparator& icmp,
const LevelFilesBrief* flevel)
const LevelFilesBrief* flevel, bool should_sample)
: icmp_(icmp),
flevel_(flevel),
index_(static_cast<uint32_t>(flevel->num_files)),
current_value_(0, 0, 0) { // Marks as invalid
}
current_value_(0, 0, 0), // Marks as invalid
should_sample_(should_sample) {}
virtual bool Valid() const override { return index_ < flevel_->num_files; }
virtual void Seek(const Slice& target) override {
index_ = FindFile(icmp_, *flevel_, target);
@ -477,6 +477,9 @@ class LevelFileNumIterator : public InternalIterator {
assert(Valid());
auto file_meta = flevel_->files[index_];
if (should_sample_) {
sample_file_read_inc(file_meta.file_metadata);
}
current_value_ = file_meta.fd;
return Slice(reinterpret_cast<const char*>(&current_value_),
sizeof(FileDescriptor));
@ -488,6 +491,7 @@ class LevelFileNumIterator : public InternalIterator {
const LevelFilesBrief* flevel_;
uint32_t index_;
mutable FileDescriptor current_value_;
bool should_sample_;
};
class LevelFileIteratorState : public TwoLevelIteratorState {
@ -745,13 +749,11 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
file_path = ioptions->db_paths.back().path;
}
files.emplace_back(
MakeTableFileName("", file->fd.GetNumber()),
file_path,
file->fd.GetFileSize(),
file->smallest_seqno,
file->largest_seqno,
MakeTableFileName("", file->fd.GetNumber()), file_path,
file->fd.GetFileSize(), file->smallest_seqno, file->largest_seqno,
file->smallest.user_key().ToString(),
file->largest.user_key().ToString(),
file->stats.num_reads_sampled.load(std::memory_order_relaxed),
file->being_compacted);
level_size += file->fd.GetFileSize();
}
@ -835,6 +837,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
return;
}
bool should_sample = should_sample_file_read();
auto* arena = merge_iter_builder->GetArena();
if (level == 0) {
// Merge all level zero files together since they may overlap
@ -845,6 +849,15 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
range_del_agg, nullptr, cfd_->internal_stats()->GetFileReadHist(0),
false, arena, false /* skip_filters */, 0 /* level */));
}
if (should_sample) {
// Count one for every L0 file. This is done per iterator creation
// rather than per Seek(), while files in other levels are recorded per seek.
// If users execute one range query per iterator, there may be some
// discrepancy here.
for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
sample_file_read_inc(meta);
}
}
} else {
// For levels > 0, we can use a concatenating iterator that sequentially
// walks through the non-overlapping files in the level, opening them
@ -859,7 +872,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
IsFilterSkipped(level), level, range_del_agg);
mem = arena->AllocateAligned(sizeof(LevelFileNumIterator));
auto* first_level_iter = new (mem) LevelFileNumIterator(
cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level));
cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
should_sample_file_read());
merge_iter_builder->AddIterator(
NewTwoLevelIterator(state, first_level_iter, arena, false));
}
@ -984,6 +998,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
user_comparator(), internal_comparator());
FdWithKeyRange* f = fp.GetNextFile();
while (f != nullptr) {
if (get_context.sample()) {
sample_file_read_inc(f->file_metadata);
}
*status = table_cache_->Get(
read_options, *internal_comparator(), f->fd, ikey, &get_context,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
@ -2201,13 +2218,16 @@ void Version::AddLiveFiles(std::vector<FileDescriptor>* live) {
}
}
std::string Version::DebugString(bool hex) const {
std::string Version::DebugString(bool hex, bool print_stats) const {
std::string r;
for (int level = 0; level < storage_info_.num_levels_; level++) {
// E.g.,
// --- level 1 ---
// 17:123['a' .. 'd']
// 20:43['e' .. 'g']
//
// if print_stats=true:
// 17:123['a' .. 'd'](4096)
r.append("--- level ");
AppendNumberTo(&r, level);
r.append(" --- version# ");
@ -2223,7 +2243,14 @@ std::string Version::DebugString(bool hex) const {
r.append(files[i]->smallest.DebugString(hex));
r.append(" .. ");
r.append(files[i]->largest.DebugString(hex));
r.append("]\n");
r.append("]");
if (print_stats) {
r.append("(");
r.append(ToString(
files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
r.append(")");
}
r.append("\n");
}
}
return r;
@ -3533,7 +3560,8 @@ InternalIterator* VersionSet::MakeInputIterator(
false /* skip_filters */, (int)which /* level */,
range_del_agg),
new LevelFileNumIterator(cfd->internal_comparator(),
c->input_levels(which)));
c->input_levels(which),
false /* don't sample compaction */));
}
}
}

@ -504,7 +504,7 @@ class Version {
void AddLiveFiles(std::vector<FileDescriptor>* live);
// Return a human readable string that describes this version's contents.
std::string DebugString(bool hex = false) const;
std::string DebugString(bool hex = false, bool print_stats = false) const;
// Returns the version number of this version
uint64_t GetVersionNumber() const { return version_number_; }

@ -55,17 +55,21 @@ struct LevelMetaData {
// The metadata that describes a SST file.
struct SstFileMetaData {
SstFileMetaData() {}
SstFileMetaData(const std::string& _file_name,
const std::string& _path, uint64_t _size,
SequenceNumber _smallest_seqno,
SstFileMetaData(const std::string& _file_name, const std::string& _path,
uint64_t _size, SequenceNumber _smallest_seqno,
SequenceNumber _largest_seqno,
const std::string& _smallestkey,
const std::string& _largestkey,
bool _being_compacted) :
size(_size), name(_file_name),
db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno),
smallestkey(_smallestkey), largestkey(_largestkey),
being_compacted(_being_compacted) {}
const std::string& _largestkey, uint64_t _num_reads_sampled,
bool _being_compacted)
: size(_size),
name(_file_name),
db_path(_path),
smallest_seqno(_smallest_seqno),
largest_seqno(_largest_seqno),
smallestkey(_smallestkey),
largestkey(_largestkey),
num_reads_sampled(_num_reads_sampled),
being_compacted(_being_compacted) {}
// File size in bytes.
uint64_t size;
@ -78,6 +82,7 @@ struct SstFileMetaData {
SequenceNumber largest_seqno; // Largest sequence number in file.
std::string smallestkey; // Smallest user defined key in the file.
std::string largestkey; // Largest user defined key in the file.
uint64_t num_reads_sampled; // How many times the file is read.
bool being_compacted; // true if the file is currently being compacted.
};
@ -86,7 +91,4 @@ struct LiveFileMetaData : SstFileMetaData {
std::string column_family_name; // Name of the column family
int level; // Level at which this file resides.
};
} // namespace rocksdb

@ -0,0 +1,25 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// This source code is also licensed under the GPLv2 license found in the
// COPYING file in the root directory of this source tree.
//
#pragma once
#include "db/version_edit.h"
#include "util/random.h"
namespace rocksdb {
static const uint32_t kFileReadSampleRate = 1024;
extern bool should_sample_file_read();
extern void sample_file_read_inc(FileMetaData*);
inline bool should_sample_file_read() {
return (Random::GetTLSInstance()->Next() % kFileReadSampleRate == 307);
}
inline void sample_file_read_inc(FileMetaData* meta) {
meta->stats.num_reads_sampled.fetch_add(kFileReadSampleRate,
std::memory_order_relaxed);
}
}
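
A note on the arithmetic above: because only roughly one in kFileReadSampleRate reads passes the sampling check, each sampled hit adds kFileReadSampleRate to the counter, keeping it an unbiased estimate of the true read count. A small standalone simulation of that math (hypothetical sketch, not part of the commit):

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <random>

int main() {
  const uint32_t kRate = 1024;  // mirrors kFileReadSampleRate
  std::atomic<uint64_t> estimate{0};
  std::mt19937 rng(42);
  std::uniform_int_distribution<uint32_t> dist(0, kRate - 1);
  const uint64_t kTrueReads = 1 << 20;  // one million simulated reads
  for (uint64_t i = 0; i < kTrueReads; ++i) {
    if (dist(rng) == 307) {  // same 1-in-kRate decision as above
      estimate.fetch_add(kRate, std::memory_order_relaxed);
    }
  }
  // The estimate lands close to kTrueReads, within sampling noise.
  std::printf("true=%llu estimate=%llu\n",
              static_cast<unsigned long long>(kTrueReads),
              static_cast<unsigned long long>(estimate.load()));
  return 0;
}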

@ -8,6 +8,7 @@
#include "table/get_context.h"
#include "db/merge_helper.h"
#include "db/pinned_iterators_manager.h"
#include "monitoring/file_read_sample.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "rocksdb/env.h"
@ -59,6 +60,7 @@ GetContext::GetContext(const Comparator* ucmp,
if (seq_) {
*seq_ = kMaxSequenceNumber;
}
sample_ = should_sample_file_read();
}
// Called from TableCache::Get and Table::Get when file/block in which

@ -62,6 +62,8 @@ class GetContext {
// Do we need to fetch the SequenceNumber for this key?
bool NeedToReadSequence() const { return (seq_ != nullptr); }
bool sample() const { return sample_; }
private:
const Comparator* ucmp_;
const MergeOperator* merge_operator_;
@ -82,6 +84,7 @@ class GetContext {
std::string* replay_log_;
// Used to temporarily pin blocks when state_ == GetContext::kMerge
PinnedIteratorsManager* pinned_iters_mgr_;
bool sample_;
};
void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
