From 40f562e7471d1b1af0d5e45ca6e5c836644a1999 Mon Sep 17 00:00:00 2001
From: sdong
Date: Fri, 12 Jun 2015 18:04:30 -0700
Subject: [PATCH] Allow GetApproximateSizes() to include mem table size if it
 is a skip list mem table

Summary:
Add an option to GetApproximateSizes() so that the result also includes the
estimated size of data in the mem tables. To implement it, add an estimated
count of entries from the beginning of a skip list up to a key. The approach
is to count, while searching for the key, how many Next() calls are issued at
each level, and sum them with a weight of branching_factor ^ level.

Test Plan: Add a test case

Subscribers: leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D40119
---
 HISTORY.md                               |   1 +
 db/db_impl.cc                            |  19 ++--
 db/db_impl.h                             |   4 +-
 db/db_test.cc                            | 110 ++++++++++++++++++++++-
 db/memtable.cc                           |  26 +++++-
 db/memtable.h                            |  10 ++-
 db/memtable_list.cc                      |   9 ++
 db/memtable_list.h                       |   2 +
 db/skiplist.h                            |  33 ++++++-
 include/rocksdb/db.h                     |  10 ++-
 include/rocksdb/memtablerep.h            |   5 ++
 include/rocksdb/utilities/stackable_db.h |   4 +-
 util/skiplistrep.cc                      |   9 ++
 13 files changed, 216 insertions(+), 26 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 0a0b60891..033c8f1fb 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -15,6 +15,7 @@
 * WBWIIterator::Entry() now returns WriteEntry instead of `const WriteEntry&`
 * options.hard_rate_limit is deprecated.
 * When options.soft_rate_limit or options.level0_slowdown_writes_trigger is triggered, the way to slow down writes is changed to: write rate to DB is limited to options.delayed_write_rate.
+* DB::GetApproximateSizes() takes a new parameter that allows the estimate to include data in the mem tables; the default is not to include it. This is currently only supported for the skip list mem table.
 
 ## 3.11.0 (5/19/2015)
 ### New Features
diff --git a/db/db_impl.cc b/db/db_impl.cc
index c1677eb9c..aca964f84 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -3881,27 +3881,26 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
 }
 
 void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                 const Range* range, int n, uint64_t* sizes) {
+                                 const Range* range, int n, uint64_t* sizes,
+                                 bool include_memtable) {
   Version* v;
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
-  {
-    InstrumentedMutexLock l(&mutex_);
-    v = cfd->current();
-    v->Ref();
-  }
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  v = sv->current;
 
   for (int i = 0; i < n; i++) {
     // Convert user_key into a corresponding internal key.
     InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
     InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
     sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
+    if (include_memtable) {
+      sizes[i] += sv->mem->ApproximateSize(k1.Encode(), k2.Encode());
+      sizes[i] += sv->imm->ApproximateSize(k1.Encode(), k2.Encode());
+    }
   }
 
-  {
-    InstrumentedMutexLock l(&mutex_);
-    v->Unref();
-  }
+  ReturnAndCleanupSuperVersion(cfd, sv);
 }
 
 std::list<uint64_t>::iterator
diff --git a/db/db_impl.h b/db/db_impl.h
index 9a42da247..89afda987 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -122,8 +122,8 @@ class DBImpl : public DB {
                               const Slice& property, uint64_t* value) override;
   using DB::GetApproximateSizes;
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n,
-                                   uint64_t* sizes) override;
+                                   const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable = false) override;
   using DB::CompactRange;
   virtual Status CompactRange(ColumnFamilyHandle* column_family,
                               const Slice* begin, const Slice* end,
diff --git a/db/db_test.cc b/db/db_test.cc
index a8a6c4bdc..a86e755b2 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -6577,6 +6577,112 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) {
   return result;
 }
 
+TEST_F(DBTest, ApproximateSizesMemTable) {
+  Options options;
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  const int N = 128;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+  }
+
+  uint64_t size;
+  std::string start = Key(50);
+  std::string end = Key(60);
+  Range r(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_GT(size, 6000);
+  ASSERT_LT(size, 204800);
+  // Zero if not including mem table
+  db_->GetApproximateSizes(&r, 1, &size, false);
+  ASSERT_EQ(size, 0);
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024)));
+  }
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  start = Key(100);
+  end = Key(1020);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_GT(size, 6000);
+
+  options.max_write_buffer_number = 8;
+  options.min_write_buffer_number_to_merge = 5;
+  options.write_buffer_size = 1024 * N;  // Not very large
+  DestroyAndReopen(options);
+
+  int keys[N * 3];
+  for (int i = 0; i < N; i++) {
+    keys[i * 3] = i * 5;
+    keys[i * 3 + 1] = i * 5 + 1;
+    keys[i * 3 + 2] = i * 5 + 2;
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
+
+  for (int i = 0; i < N * 3; i++) {
+    ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024)));
+  }
+
+  start = Key(100);
+  end = Key(300);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_GT(size, 6000);
+
+  start = Key(2100);
+  end = Key(2300);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  uint64_t size_with_mt, size_without_mt;
+  db_->GetApproximateSizes(&r, 1, &size_with_mt, true);
+  ASSERT_GT(size_with_mt, 6000);
+  db_->GetApproximateSizes(&r, 1, &size_without_mt, false);
+  ASSERT_EQ(size_without_mt, 0);
+
+  Flush();
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024)));
+  }
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size_with_mt, true);
+  db_->GetApproximateSizes(&r, 1, &size_without_mt, false);
+  ASSERT_GT(size_with_mt, size_without_mt);
+  ASSERT_GT(size_without_mt, 6000);
+}
+
 TEST_F(DBTest, ApproximateSizes) {
   do {
     Options options;
@@ -8948,8 +9054,8 @@ class ModelDB: public DB {
   }
   using DB::GetApproximateSizes;
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n,
-                                   uint64_t* sizes) override {
+                                   const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable) override {
     for (int i = 0; i < n; i++) {
       sizes[i] = 0;
     }
diff --git a/db/memtable.cc b/db/memtable.cc
index a98dc7b78..c447cbbb1 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -64,6 +64,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       table_(ioptions.memtable_factory->CreateMemTableRep(
           comparator_, &allocator_, ioptions.prefix_extractor,
           ioptions.info_log)),
+      data_size_(0),
       num_entries_(0),
       num_deletes_(0),
       flush_in_progress_(false),
@@ -290,6 +291,26 @@ port::RWMutex* MemTable::GetLock(const Slice& key) {
   return &locks_[hash(key) % locks_.size()];
 }
 
+uint64_t MemTable::ApproximateSize(const Slice& start_ikey,
+                                   const Slice& end_ikey) {
+  uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+  if (entry_count == 0) {
+    return 0;
+  }
+  uint64_t n = num_entries_.load(std::memory_order_relaxed);
+  if (n == 0) {
+    return 0;
+  }
+  if (entry_count > n) {
+    // table_->ApproximateNumEntries() is just an estimate, so it can be
+    // larger than the actual number of entries. Cap it to the number of
+    // entries we have, to limit the inaccuracy.
+    entry_count = n;
+  }
+  uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+  return entry_count * (data_size / n);
+}
+
 void MemTable::Add(SequenceNumber s, ValueType type,
                    const Slice& key, /* user key */
                    const Slice& value) {
@@ -317,7 +338,10 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   memcpy(p, value.data(), val_size);
   assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
   table_->Insert(handle);
-  num_entries_++;
+  num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+                     std::memory_order_relaxed);
+  data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+                   std::memory_order_relaxed);
   if (type == kTypeDeletion) {
     num_deletes_++;
   }
diff --git a/db/memtable.h b/db/memtable.h
index 663d6e656..f09082ce0 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -212,7 +212,9 @@ class MemTable {
   // Get total number of entries in the mem table.
   // REQUIRES: external synchronization to prevent simultaneous
   // operations on the same MemTable (unless this Memtable is immutable).
-  uint64_t num_entries() const { return num_entries_; }
+  uint64_t num_entries() const {
+    return num_entries_.load(std::memory_order_relaxed);
+  }
 
   // Get total number of deletes in the mem table.
   // REQUIRES: external synchronization to prevent simultaneous
@@ -275,6 +277,8 @@ class MemTable {
     return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
   }
 
+  uint64_t ApproximateSize(const Slice& start_ikey, const Slice& end_ikey);
+
   // Get the lock associated for the key
   port::RWMutex* GetLock(const Slice& key);
 
@@ -300,7 +304,9 @@ class MemTable {
   MemTableAllocator allocator_;
   unique_ptr<MemTableRep> table_;
 
-  uint64_t num_entries_;
+  // Total data size of all data inserted
+  std::atomic<uint64_t> data_size_;
+  std::atomic<uint64_t> num_entries_;
   uint64_t num_deletes_;
 
   // These are used to manage memtable flushes to storage
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index 16cc13e25..f74f1b377 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -149,6 +149,15 @@ uint64_t MemTableListVersion::GetTotalNumEntries() const {
   return total_num;
 }
 
+uint64_t MemTableListVersion::ApproximateSize(const Slice& start_ikey,
+                                              const Slice& end_ikey) {
+  uint64_t total_size = 0;
+  for (auto& m : memlist_) {
+    total_size += m->ApproximateSize(start_ikey, end_ikey);
+  }
+  return total_size;
+}
+
 uint64_t MemTableListVersion::GetTotalNumDeletes() const {
   uint64_t total_num = 0;
   for (auto& m : memlist_) {
diff --git a/db/memtable_list.h b/db/memtable_list.h
index 2be4ab00d..3d19290fd 100644
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -84,6 +84,8 @@ class MemTableListVersion {
 
   uint64_t GetTotalNumDeletes() const;
 
+  uint64_t ApproximateSize(const Slice& start_ikey, const Slice& end_ikey);
+
   // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
   // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
   // If include_history=true, will also search Memtables in MemTableList
diff --git a/db/skiplist.h b/db/skiplist.h
index c1e375007..c61c1947e 100644
--- a/db/skiplist.h
+++ b/db/skiplist.h
@@ -59,6 +59,9 @@ class SkipList {
   // Returns true iff an entry that compares equal to key is in the list.
   bool Contains(const Key& key) const;
 
+  // Return estimated number of entries smaller than `key`.
+  uint64_t EstimateCount(const Key& key) const;
+
   // Iteration over the contents of a skip list
   class Iterator {
    public:
@@ -354,10 +357,34 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
   }
 }
 
-template<typename Key, class Comparator>
+template <typename Key, class Comparator>
+uint64_t SkipList<Key, Comparator>::EstimateCount(const Key& key) const {
+  uint64_t count = 0;
+
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == nullptr || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return count;
+      } else {
+        // Switch to next list
+        count *= kBranching_;
+        level--;
+      }
+    } else {
+      x = next;
+      count++;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
 SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
-                                     int32_t max_height,
-                                     int32_t branching_factor)
+                                    int32_t max_height,
+                                    int32_t branching_factor)
     : kMaxHeight_(max_height),
       kBranching_(branching_factor),
       compare_(cmp),
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index d25c421dc..54536e999 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -397,10 +397,12 @@ class DB {
   //
   // The results may not include the sizes of recently written data.
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n,
-                                   uint64_t* sizes) = 0;
-  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
-    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
+                                   const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable = false) = 0;
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable = false) {
+    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes,
+                        include_memtable);
   }
 
   // Compact the underlying storage for the key range [*begin,*end].
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index c369e888a..f02c2d094 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -103,6 +103,11 @@ class MemTableRep {
   virtual void Get(const LookupKey& k, void* callback_args,
                    bool (*callback_func)(void* arg, const char* entry));
 
+  virtual uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                         const Slice& end_key) {
+    return 0;
+  }
+
   // Report an approximation of how much memory has been used other than memory
   // that was allocated through the allocator.
   virtual size_t ApproximateMemoryUsage() = 0;
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 2a7f8e64a..6231b339b 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -121,8 +121,9 @@ class StackableDB : public DB {
 
   using DB::GetApproximateSizes;
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* r, int n,
-                                   uint64_t* sizes) override {
-    return db_->GetApproximateSizes(column_family, r, n, sizes);
+                                   const Range* r, int n, uint64_t* sizes,
+                                   bool include_memtable = false) override {
+    return db_->GetApproximateSizes(column_family, r, n, sizes,
+                                    include_memtable);
   }
 
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
index ee57372fa..112a7ab12 100644
--- a/util/skiplistrep.cc
+++ b/util/skiplistrep.cc
@@ -52,6 +52,15 @@ public:
     }
   }
 
+  uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                 const Slice& end_ikey) override {
+    std::string tmp;
+    uint64_t start_count =
+        skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey));
+    uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey));
+    return (end_count >= start_count) ? (end_count - start_count) : 0;
+  }
+
   virtual ~SkipListRep() override { }
 
   // Iteration over the contents of a skip list
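
The estimate computed by SkipList::EstimateCount() above weights each Next()
call by the size of the gap it skips: a step taken at level L jumps over
roughly branching_factor ^ L entries, so with a branching factor of 4, two
steps at level 2 and one step at level 1 contribute about 2 * 16 + 1 * 4 = 36
entries. The sketch below restates that weighted sum in isolation; it is not
part of the patch, and the helper name and per-level step counts are assumed
inputs for illustration only.

  #include <cstdint>
  #include <vector>

  // Horner-style evaluation of sum(steps[level] * branching^level), which is
  // what EstimateCount() accumulates incrementally by doing
  // `count *= kBranching_` on every level drop. steps[level] is the number of
  // Next() calls issued at that level before the target key was overshot.
  uint64_t WeightedStepSum(const std::vector<uint64_t>& steps,
                           uint64_t branching) {
    uint64_t count = 0;
    for (int level = static_cast<int>(steps.size()) - 1; level >= 0; --level) {
      count = count * branching + steps[level];
    }
    return count;
  }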
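
MemTable::ApproximateSize() then turns that entry count into bytes by
multiplying it with the average encoded entry size observed so far
(data_size_ / num_entries_), after capping the count at the real number of
entries. A free-standing restatement of the same arithmetic, with a
hypothetical function name and parameters, for illustration only:

  #include <algorithm>
  #include <cstdint>

  // Mirrors the arithmetic in MemTable::ApproximateSize() from this patch;
  // not a RocksDB API.
  uint64_t EstimateMemTableBytes(uint64_t estimated_entries_in_range,
                                 uint64_t total_entries,
                                 uint64_t total_data_size) {
    if (estimated_entries_in_range == 0 || total_entries == 0) {
      return 0;
    }
    // The skip-list estimate can overshoot; never report more entries than
    // the mem table actually holds.
    uint64_t entry_count = std::min(estimated_entries_in_range, total_entries);
    // Scale by the average encoded entry size.
    return entry_count * (total_data_size / total_entries);
  }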
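
Finally, a small caller-side usage sketch of the new parameter, assuming a
build that contains this patch and an already opened rocksdb::DB* db; the key
names are placeholders:

  rocksdb::Range r("key100", "key200");
  uint64_t size_with_mem = 0;
  uint64_t size_sst_only = 0;
  // Include data that is still only in the (skip-list) mem tables.
  db->GetApproximateSizes(&r, 1, &size_with_mem, true /* include_memtable */);
  // Default behavior is unchanged: sizes from SST files only.
  db->GetApproximateSizes(&r, 1, &size_sst_only);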