Allow GetApproximateSizes() to include memtable size when the memtable is a skip list

Summary:
Add an option to GetApproximateSizes() so that the result can include the estimated size of data in memtables.
To implement it, add an estimate of the number of entries between the beginning of a skip list and a given key. The approach is to count, while searching for the key, how many Next() calls are issued at each level, and to sum those counts with a weight of <branching factor> ^ <level>.

Test Plan: Add a test case

Subscribers: leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D40119
main
sdong 10 years ago
parent d59d90bb1f
commit 40f562e747
  1. 1
      HISTORY.md
  2. 19
      db/db_impl.cc
  3. 4
      db/db_impl.h
  4. 110
      db/db_test.cc
  5. 26
      db/memtable.cc
  6. 10
      db/memtable.h
  7. 9
      db/memtable_list.cc
  8. 2
      db/memtable_list.h
  9. 33
      db/skiplist.h
  10. 10
      include/rocksdb/db.h
  11. 5
      include/rocksdb/memtablerep.h
  12. 4
      include/rocksdb/utilities/stackable_db.h
  13. 9
      util/skiplistrep.cc

@ -15,6 +15,7 @@
* WBWIIterator::Entry() now returns WriteEntry instead of `const WriteEntry&`
* options.hard_rate_limit is deprecated.
* When options.soft_rate_limit or options.level0_slowdown_writes_trigger is triggered, the way to slow down writes is changed to: write rate to DB is limited to options.delayed_write_rate.
* DB::GetApproximateSizes() adds a parameter that allows the estimate to include data in the memtable; by default that data is not included. This is currently supported only for the skip list memtable.
## 3.11.0 (5/19/2015)
### New Features

@ -3881,27 +3881,26 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
}
void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes) {
const Range* range, int n, uint64_t* sizes,
bool include_memtable) {
Version* v;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
{
InstrumentedMutexLock l(&mutex_);
v = cfd->current();
v->Ref();
}
SuperVersion* sv = GetAndRefSuperVersion(cfd);
v = sv->current;
for (int i = 0; i < n; i++) {
// Convert user_key into a corresponding internal key.
InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
if (include_memtable) {
sizes[i] += sv->mem->ApproximateSize(k1.Encode(), k2.Encode());
sizes[i] += sv->imm->ApproximateSize(k1.Encode(), k2.Encode());
}
}
{
InstrumentedMutexLock l(&mutex_);
v->Unref();
}
ReturnAndCleanupSuperVersion(cfd, sv);
}
std::list<uint64_t>::iterator

@ -122,8 +122,8 @@ class DBImpl : public DB {
const Slice& property, uint64_t* value) override;
using DB::GetApproximateSizes;
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n,
uint64_t* sizes) override;
const Range* range, int n, uint64_t* sizes,
bool include_memtable = false) override;
using DB::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,

@ -6577,6 +6577,112 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) {
return result;
}
// Exercises DB::GetApproximateSizes() with include_memtable set to true and
// false, first while all data is still unflushed, then after reopening with
// different write-buffer options, and finally after a Flush().
TEST_F(DBTest, ApproximateSizesMemTable) {
Options options;
options.write_buffer_size = 100000000; // Large write buffer
options.compression = kNoCompression;
options.create_if_missing = true;
options = CurrentOptions(options);
DestroyAndReopen(options);
// Phase 1: write N entries with keys Key(0) .. Key(N - 1).
// NOTE(review): RandomString(&rnd, 1024) presumably yields 1024-byte values,
// which is what the 6000-byte lower bounds below rely on -- confirm.
const int N = 128;
Random rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
}
// A range that overlaps written keys must report a non-trivial, bounded size
// when the mem table is included.
uint64_t size;
std::string start = Key(50);
std::string end = Key(60);
Range r(start, end);
db_->GetApproximateSizes(&r, 1, &size, true);
ASSERT_GT(size, 6000);
ASSERT_LT(size, 204800);
// Zero if not including mem table
db_->GetApproximateSizes(&r, 1, &size, false);
ASSERT_EQ(size, 0);
// A range in which no key has been written is expected to report zero even
// with the mem table included.
start = Key(500);
end = Key(600);
r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, true);
ASSERT_EQ(size, 0);
// Add N more entries, Key(1000) .. Key(1000 + N - 1), i.e. beyond the empty
// range queried above.
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024)));
}
// The same range is queried again now that there are entries on both sides
// of it; the expectation is still zero.
start = Key(500);
end = Key(600);
r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, true);
ASSERT_EQ(size, 0);
// A range spanning the tail of the first batch and the head of the second.
start = Key(100);
end = Key(1020);
r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, true);
ASSERT_GT(size, 6000);
// Phase 2: reopen with a much smaller write buffer and more write buffers.
// NOTE(review): these settings are presumably chosen so that the writes below
// spread over several mem tables -- confirm against the option semantics.
options.max_write_buffer_number = 8;
options.min_write_buffer_number_to_merge = 5;
options.write_buffer_size = 1024 * N; // Not very large
DestroyAndReopen(options);
// Build 3 * N key numbers of the form i * 5, i * 5 + 1 and i * 5 + 2, then
// insert them in shuffled order, each offset by 1000.
int keys[N * 3];
for (int i = 0; i < N; i++) {
keys[i * 3] = i * 5;
keys[i * 3 + 1] = i * 5 + 1;
keys[i * 3 + 2] = i * 5 + 2;
}
std::random_shuffle(std::begin(keys), std::end(keys));
for (int i = 0; i < N * 3; i++) {
ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024)));
}
// Range below every inserted key number: expect zero.
start = Key(100);
end = Key(300);
r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, true);
ASSERT_EQ(size, 0);
// Range overlapping inserted key numbers: expect a non-trivial size.
start = Key(1050);
end = Key(1080);
r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, true);
ASSERT_GT(size, 6000);
// Range above every inserted key number: expect zero.
start = Key(2100);
end = Key(2300);
r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, true);
ASSERT_EQ(size, 0);
// Before any explicit Flush(): the overlapping range is non-trivial with the
// mem table included and zero without it.
start = Key(1050);
end = Key(1080);
r = Range(start, end);
uint64_t size_with_mt, size_without_mt;
db_->GetApproximateSizes(&r, 1, &size_with_mt, true);
ASSERT_GT(size_with_mt, 6000);
db_->GetApproximateSizes(&r, 1, &size_without_mt, false);
ASSERT_EQ(size_without_mt, 0);
// Phase 3: flush, then write N more entries into the same key region.
Flush();
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024)));
}
// Now the size without the mem table must itself be non-trivial, and the
// size with the mem table included must be strictly larger.
start = Key(1050);
end = Key(1080);
r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size_with_mt, true);
db_->GetApproximateSizes(&r, 1, &size_without_mt, false);
ASSERT_GT(size_with_mt, size_without_mt);
ASSERT_GT(size_without_mt, 6000);
}
TEST_F(DBTest, ApproximateSizes) {
do {
Options options;
@ -8948,8 +9054,8 @@ class ModelDB: public DB {
}
using DB::GetApproximateSizes;
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n,
uint64_t* sizes) override {
const Range* range, int n, uint64_t* sizes,
bool include_memtable) override {
for (int i = 0; i < n; i++) {
sizes[i] = 0;
}

@ -64,6 +64,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
table_(ioptions.memtable_factory->CreateMemTableRep(
comparator_, &allocator_, ioptions.prefix_extractor,
ioptions.info_log)),
data_size_(0),
num_entries_(0),
num_deletes_(0),
flush_in_progress_(false),
@ -290,6 +291,26 @@ port::RWMutex* MemTable::GetLock(const Slice& key) {
return &locks_[hash(key) % locks_.size()];
}
uint64_t MemTable::ApproximateSize(const Slice& start_ikey,
const Slice& end_ikey) {
uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
if (entry_count == 0) {
return 0;
}
uint64_t n = num_entries_.load(std::memory_order_relaxed);
if (n == 0) {
return 0;
}
if (entry_count > n) {
// table_->ApproximateNumEntries() is just an estimate so it can be larger
// than actual entries we have. Cap it to entries we have to limit the
// inaccuracy.
entry_count = n;
}
uint64_t data_size = data_size_.load(std::memory_order_relaxed);
return entry_count * (data_size / n);
}
void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key, /* user key */
const Slice& value) {
@ -317,7 +338,10 @@ void MemTable::Add(SequenceNumber s, ValueType type,
memcpy(p, value.data(), val_size);
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
table_->Insert(handle);
num_entries_++;
num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
std::memory_order_relaxed);
data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
std::memory_order_relaxed);
if (type == kTypeDeletion) {
num_deletes_++;
}

@ -212,7 +212,9 @@ class MemTable {
// Get total number of entries in the mem table.
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable (unless this Memtable is immutable).
uint64_t num_entries() const { return num_entries_; }
uint64_t num_entries() const {
  // Relaxed load of the entry counter.
  const uint64_t entries = num_entries_.load(std::memory_order_relaxed);
  return entries;
}
// Get total number of deletes in the mem table.
// REQUIRES: external synchronization to prevent simultaneous
@ -275,6 +277,8 @@ class MemTable {
return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
}
uint64_t ApproximateSize(const Slice& start_ikey, const Slice& end_ikey);
// Get the lock associated for the key
port::RWMutex* GetLock(const Slice& key);
@ -300,7 +304,9 @@ class MemTable {
MemTableAllocator allocator_;
unique_ptr<MemTableRep> table_;
uint64_t num_entries_;
// Total data size of all data inserted
std::atomic<uint64_t> data_size_;
std::atomic<uint64_t> num_entries_;
uint64_t num_deletes_;
// These are used to manage memtable flushes to storage

@ -149,6 +149,15 @@ uint64_t MemTableListVersion::GetTotalNumEntries() const {
return total_num;
}
// Sums MemTable::ApproximateSize(start_ikey, end_ikey) over every mem table
// held in memlist_ and returns the total.
uint64_t MemTableListVersion::ApproximateSize(const Slice& start_ikey,
                                              const Slice& end_ikey) {
  uint64_t accumulated = 0;
  for (auto it = memlist_.begin(); it != memlist_.end(); ++it) {
    accumulated += (*it)->ApproximateSize(start_ikey, end_ikey);
  }
  return accumulated;
}
uint64_t MemTableListVersion::GetTotalNumDeletes() const {
uint64_t total_num = 0;
for (auto& m : memlist_) {

@ -84,6 +84,8 @@ class MemTableListVersion {
uint64_t GetTotalNumDeletes() const;
uint64_t ApproximateSize(const Slice& start_ikey, const Slice& end_ikey);
// Returns the value of MemTable::GetEarliestSequenceNumber() on the most
// recent MemTable in this list or kMaxSequenceNumber if the list is empty.
// If include_history=true, will also search Memtables in MemTableList

@ -59,6 +59,9 @@ class SkipList {
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const;
// Return estimated number of entries smaller than `key`.
uint64_t EstimateCount(const Key& key) const;
// Iteration over the contents of a skip list
class Iterator {
public:
@ -354,10 +357,34 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
}
}
template<typename Key, class Comparator>
// Returns an estimate of the number of entries that compare smaller than
// `key`. The list is walked from the highest level down: every forward step
// adds one to the running count, and every drop to the next lower level
// multiplies the running count by kBranching_.
template <typename Key, class Comparator>
uint64_t SkipList<Key, Comparator>::EstimateCount(const Key& key) const {
  uint64_t estimate = 0;
  Node* cur = head_;
  int lvl = GetMaxHeight() - 1;
  for (;;) {
    assert(cur == head_ || compare_(cur->key, key) < 0);
    Node* succ = cur->Next(lvl);
    const bool can_advance = succ != nullptr && compare_(succ->key, key) < 0;
    if (can_advance) {
      // Successor is still smaller than key: step forward on this level.
      cur = succ;
      ++estimate;
      continue;
    }
    if (lvl == 0) {
      // Nothing further to scan on the bottom level.
      return estimate;
    }
    // Descend one level, re-weighting the steps taken so far.
    estimate *= kBranching_;
    --lvl;
  }
}
template <typename Key, class Comparator>
SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
int32_t max_height,
int32_t branching_factor)
int32_t max_height,
int32_t branching_factor)
: kMaxHeight_(max_height),
kBranching_(branching_factor),
compare_(cmp),

@ -397,10 +397,12 @@ class DB {
//
// The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n,
uint64_t* sizes) = 0;
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
const Range* range, int n, uint64_t* sizes,
bool include_memtable = false) = 0;
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
bool include_memtable = false) {
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes,
include_memtable);
}
// Compact the underlying storage for the key range [*begin,*end].

@ -103,6 +103,11 @@ class MemTableRep {
virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry));
// Returns an estimate of the number of entries in the range from start_ikey
// to end_ikey. This default implementation always returns 0; representations
// that are able to produce an estimate override it.
virtual uint64_t ApproximateNumEntries(const Slice& start_ikey,
                                       const Slice& end_ikey) {
  return 0;
}
// Report an approximation of how much memory has been used other than memory
// that was allocated through the allocator.
virtual size_t ApproximateMemoryUsage() = 0;

@ -121,8 +121,8 @@ class StackableDB : public DB {
using DB::GetApproximateSizes;
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* r, int n,
uint64_t* sizes) override {
const Range* r, int n, uint64_t* sizes,
bool include_memtable = false) override {
return db_->GetApproximateSizes(column_family, r, n, sizes);
}

@ -52,6 +52,15 @@ public:
}
}
// Estimates the number of entries between start_ikey and end_ikey as the
// difference of the skip list's EstimateCount() for the two keys. Returns 0
// if the estimate for end_ikey is smaller than the one for start_ikey.
uint64_t ApproximateNumEntries(const Slice& start_ikey,
                               const Slice& end_ikey) override {
  // One scratch string is handed to EncodeKey() for both keys in turn.
  std::string scratch;
  const uint64_t before_start =
      skip_list_.EstimateCount(EncodeKey(&scratch, start_ikey));
  const uint64_t before_end =
      skip_list_.EstimateCount(EncodeKey(&scratch, end_ikey));
  if (before_end < before_start) {
    return 0;
  }
  return before_end - before_start;
}
virtual ~SkipListRep() override { }
// Iteration over the contents of a skip list

Loading…
Cancel
Save