diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc
index dfff9fc43..0ea1d7d92 100644
--- a/db/db_properties_test.cc
+++ b/db/db_properties_test.cc
@@ -557,9 +557,9 @@ TEST_F(DBPropertiesTest, NumImmutableMemTable) {
     ASSERT_EQ(num, "3");
     ASSERT_TRUE(dbfull()->GetProperty(
         handles_[1], "rocksdb.cur-size-active-mem-table", &num));
-    // "192" is the size of the metadata of an empty skiplist, this would
-    // break if we change the default skiplist implementation
-    ASSERT_EQ(num, "192");
+    // "384" is the size of the metadata of two empty skiplists, this would
+    // break if we change the default vectorrep/skiplist implementation
+    ASSERT_EQ(num, "384");
     uint64_t int_num;
     uint64_t base_total_size;
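The expected value of "rocksdb.cur-size-active-mem-table" doubles because, with this patch, every MemTable owns a second rep dedicated to range tombstones (see the memtable.cc changes below), so even an empty memtable pays the metadata cost of two empty skiplists. A deliberately simplified, standalone sketch of that ownership structure (illustrative only; names and numbers are placeholders, not the real MemTable):

// Sketch only -- not RocksDB code. One object owns two reps and reports
// their combined footprint, which is why the empty-memtable size above
// changed from one skiplist's metadata to two.
#include <cstddef>
#include <memory>

struct FakeRep {
  // Stand-in for MemTableRep; a real rep reports its own allocations.
  size_t ApproximateMemoryUsage() const { return 192; }
};

struct FakeMemTable {
  std::unique_ptr<FakeRep> table{new FakeRep};            // point entries
  std::unique_ptr<FakeRep> range_del_table{new FakeRep};  // range tombstones

  size_t ApproximateMemoryUsage() const {
    // Both reps count toward the memtable's reported size.
    return table->ApproximateMemoryUsage() +
           range_del_table->ApproximateMemoryUsage();
  }
};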
diff --git a/db/memtable.cc b/db/memtable.cc
index a25c05a3c..b3f0b65e1 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -68,6 +68,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       table_(ioptions.memtable_factory->CreateMemTableRep(
           comparator_, &allocator_, ioptions.prefix_extractor,
           ioptions.info_log)),
+      range_del_table_(ioptions.memtable_factory->CreateMemTableRep(
+          comparator_, &allocator_, nullptr /* transform */,
+          ioptions.info_log)),
       data_size_(0),
       num_entries_(0),
       num_deletes_(0),
@@ -101,6 +104,7 @@ MemTable::~MemTable() { assert(refs_ == 0); }
 size_t MemTable::ApproximateMemoryUsage() {
   size_t arena_usage = arena_.ApproximateMemoryUsage();
   size_t table_usage = table_->ApproximateMemoryUsage();
+  table_usage += range_del_table_->ApproximateMemoryUsage();
   // let MAX_USAGE = std::numeric_limits<size_t>::max()
   // then if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE.
   // the following variation is to avoid numeric overflow.
@@ -122,8 +126,9 @@ bool MemTable::ShouldFlushNow() const {
   // If arena still have room for new block allocation, we can safely say it
   // shouldn't flush.
-  auto allocated_memory =
-      table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();
+  auto allocated_memory = table_->ApproximateMemoryUsage() +
+                          range_del_table_->ApproximateMemoryUsage() +
+                          arena_.MemoryAllocatedBytes();
   // if we can still allocate one more block without exceeding the
   // over-allocation ratio, then we should not flush.
@@ -219,14 +224,16 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
 class MemTableIterator : public InternalIterator {
  public:
   MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
-                   Arena* arena)
+                   Arena* arena, bool use_range_del_table = false)
       : bloom_(nullptr),
         prefix_extractor_(mem.prefix_extractor_),
         comparator_(mem.comparator_),
         valid_(false),
         arena_mode_(arena != nullptr),
         value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) {
-    if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
+    if (use_range_del_table) {
+      iter_ = mem.range_del_table_->GetIterator(arena);
+    } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
       bloom_ = mem.prefix_bloom_.get();
       iter_ = mem.table_->GetDynamicPrefixIterator(arena);
     } else {
@@ -356,6 +363,14 @@ InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
   return new (mem) MemTableIterator(*this, read_options, arena);
 }
 
+InternalIterator* MemTable::NewRangeTombstoneIterator(
+    const ReadOptions& read_options, Arena* arena) {
+  assert(arena != nullptr);
+  auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+  return new (mem) MemTableIterator(*this, read_options, arena,
+                                    true /* use_range_del_table */);
+}
+
 port::RWMutex* MemTable::GetLock(const Slice& key) {
   static murmur_hash hash;
   return &locks_[hash(key) % locks_.size()];
@@ -364,6 +379,7 @@ port::RWMutex* MemTable::GetLock(const Slice& key) {
 uint64_t MemTable::ApproximateSize(const Slice& start_ikey,
                                    const Slice& end_ikey) {
   uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+  entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
   if (entry_count == 0) {
     return 0;
   }
@@ -372,9 +388,9 @@ uint64_t MemTable::ApproximateSize(const Slice& start_ikey,
     return 0;
   }
   if (entry_count > n) {
-    // table_->ApproximateNumEntries() is just an estimate so it can be larger
-    // than actual entries we have. Cap it to entries we have to limit the
-    // inaccuracy.
+    // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can
+    // be larger than actual entries we have. Cap it to entries we have to limit
+    // the inaccuracy.
     entry_count = n;
   }
   uint64_t data_size = data_size_.load(std::memory_order_relaxed);
@@ -397,7 +413,9 @@ void MemTable::Add(SequenceNumber s, ValueType type,
                          internal_key_size + VarintLength(val_size) + val_size;
   char* buf = nullptr;
-  KeyHandle handle = table_->Allocate(encoded_len, &buf);
+  std::unique_ptr<MemTableRep>& table =
+      type == kTypeRangeDeletion ? range_del_table_ : table_;
+  KeyHandle handle = table->Allocate(encoded_len, &buf);
   char* p = EncodeVarint32(buf, internal_key_size);
   memcpy(p, key.data(), key_size);
@@ -409,7 +427,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   memcpy(p, value.data(), val_size);
   assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
   if (!allow_concurrent) {
-    table_->Insert(handle);
+    table->Insert(handle);
     // this is a bit ugly, but is the way to avoid locked instructions
     // when incrementing an atomic
@@ -441,7 +459,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
     assert(post_process_info == nullptr);
     UpdateFlushState();
   } else {
-    table_->InsertConcurrently(handle);
+    table->InsertConcurrently(handle);
     assert(post_process_info != nullptr);
     post_process_info->num_entries++;
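The comment kept in the ApproximateMemoryUsage() hunk above ("if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE ... avoid numeric overflow") describes a saturating sum. A standalone illustration of that pattern, assuming size_t operands as in the comment (a sketch, not the patch's code):

// Saturating addition: never let the sum wrap around, clamp to max instead.
#include <cstddef>
#include <limits>

size_t SaturatingAdd(size_t a, size_t b) {
  const size_t kMax = std::numeric_limits<size_t>::max();
  // The comparison is rearranged so a + b is never evaluated when it would
  // overflow; in that case the maximum value is returned.
  return (a >= kMax - b) ? kMax : a + b;
}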
diff --git a/db/memtable.h b/db/memtable.h
index e9e370a41..652a81711 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -158,6 +158,9 @@ class MemTable {
   // those allocated in arena.
   InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
 
+  InternalIterator* NewRangeTombstoneIterator(const ReadOptions& read_options,
+                                              Arena* arena);
+
   // Add an entry into memtable that maps key to value at the
   // specified sequence number and with the specified type.
   // Typically value will be empty if type==kTypeDeletion.
@@ -344,6 +347,7 @@ class MemTable {
   ConcurrentArena arena_;
   MemTableAllocator allocator_;
   unique_ptr<MemTableRep> table_;
+  unique_ptr<MemTableRep> range_del_table_;
 
   // Total data size of all data inserted
   std::atomic<uint64_t> data_size_;
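The header change only declares the new entry point; callers pair it with the existing NewIterator(), which is exactly what the test updates below do. A minimal usage sketch, assuming an already-populated MemTable* mem and the RocksDB-internal headers those tests already include:

// Sketch only: point entries and range tombstones are read through separate
// iterators. For a tombstone, the entry's internal key encodes the range's
// start key and the entry's value holds the end key.
void DumpMemTable(MemTable* mem) {
  Arena arena;
  ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena));
  ScopedArenaIterator range_del_iter(
      mem->NewRangeTombstoneIterator(ReadOptions(), &arena));
  for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
       range_del_iter->Next()) {
    // e.g. format range_del_iter->key() / range_del_iter->value()
  }
}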
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
index 275b5671e..05e7c0e7b 100644
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@@ -43,60 +43,65 @@ static std::string PrintContents(WriteBatch* b) {
   int single_delete_count = 0;
   int delete_range_count = 0;
   int merge_count = 0;
-  Arena arena;
-  ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena));
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    ParsedInternalKey ikey;
-    memset((void *)&ikey, 0, sizeof(ikey));
-    EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey));
-    switch (ikey.type) {
-      case kTypeValue:
-        state.append("Put(");
-        state.append(ikey.user_key.ToString());
-        state.append(", ");
-        state.append(iter->value().ToString());
-        state.append(")");
-        count++;
-        put_count++;
-        break;
-      case kTypeDeletion:
-        state.append("Delete(");
-        state.append(ikey.user_key.ToString());
-        state.append(")");
-        count++;
-        delete_count++;
-        break;
-      case kTypeSingleDeletion:
-        state.append("SingleDelete(");
-        state.append(ikey.user_key.ToString());
-        state.append(")");
-        count++;
-        single_delete_count++;
-        break;
-      case kTypeRangeDeletion:
-        state.append("DeleteRange(");
-        state.append(ikey.user_key.ToString());
-        state.append(", ");
-        state.append(iter->value().ToString());
-        state.append(")");
-        count++;
-        delete_range_count++;
-        break;
-      case kTypeMerge:
-        state.append("Merge(");
-        state.append(ikey.user_key.ToString());
-        state.append(", ");
-        state.append(iter->value().ToString());
-        state.append(")");
-        count++;
-        merge_count++;
-        break;
-      default:
-        assert(false);
-        break;
+  for (int i = 0; i < 2; ++i) {
+    Arena arena;
+    auto iter =
+        i == 0 ? ScopedArenaIterator(mem->NewIterator(ReadOptions(), &arena))
+               : ScopedArenaIterator(
+                     mem->NewRangeTombstoneIterator(ReadOptions(), &arena));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ParsedInternalKey ikey;
+      memset((void*)&ikey, 0, sizeof(ikey));
+      EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey));
+      switch (ikey.type) {
+        case kTypeValue:
+          state.append("Put(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          put_count++;
+          break;
+        case kTypeDeletion:
+          state.append("Delete(");
+          state.append(ikey.user_key.ToString());
+          state.append(")");
+          count++;
+          delete_count++;
+          break;
+        case kTypeSingleDeletion:
+          state.append("SingleDelete(");
+          state.append(ikey.user_key.ToString());
+          state.append(")");
+          count++;
+          single_delete_count++;
+          break;
+        case kTypeRangeDeletion:
+          state.append("DeleteRange(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          delete_range_count++;
+          break;
+        case kTypeMerge:
+          state.append("Merge(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          merge_count++;
+          break;
+        default:
+          assert(false);
+          break;
+      }
+      state.append("@");
+      state.append(NumberToString(ikey.sequence));
     }
-    state.append("@");
-    state.append(NumberToString(ikey.sequence));
   }
   EXPECT_EQ(b->HasPut(), put_count > 0);
   EXPECT_EQ(b->HasDelete(), delete_count > 0);
@@ -131,10 +136,10 @@ TEST_F(WriteBatchTest, Multiple) {
   ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
   ASSERT_EQ(4, WriteBatchInternal::Count(&batch));
   ASSERT_EQ(
-      "DeleteRange(bar, foo)@102"
       "Put(baz, boo)@103"
       "Delete(box)@101"
-      "Put(foo, bar)@100",
+      "Put(foo, bar)@100"
+      "DeleteRange(bar, foo)@102",
      PrintContents(&batch));
   ASSERT_EQ(4, batch.Count());
 }
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
index f0779f9ca..724d69f17 100644
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -623,6 +623,11 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
     // TODO(wanning&andrewkr) add num_tomestone to table properties
     r->range_del_block.Add(key, value);
     ++r->props.num_entries;
+    r->props.raw_key_size += key.size();
+    r->props.raw_value_size += value.size();
+    NotifyCollectTableCollectorsOnAdd(key, value, r->offset,
+                                      r->table_properties_collectors,
+                                      r->ioptions.info_log);
   } else {
     assert(false);
   }
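The expected string in WriteBatchTest.Multiple is reordered because PrintContents() now walks the point-entry table first and the range-deletion table second, so DeleteRange entries are appended last. On the builder side, range tombstones now also feed the raw-size properties and the property collectors. A simplified, standalone illustration of that bookkeeping (FakeProps is a placeholder, not the real TableProperties or Rep members):

// Sketch only: a range tombstone's start key is stored as the entry key and
// its end key as the entry value, so both now count toward the raw key/value
// size statistics, just like an ordinary Put.
#include <cstdint>
#include <string>

struct FakeProps {
  uint64_t num_entries = 0;
  uint64_t raw_key_size = 0;
  uint64_t raw_value_size = 0;
};

void AddRangeTombstone(FakeProps* props, const std::string& begin_key,
                       const std::string& end_key) {
  ++props->num_entries;
  props->raw_key_size += begin_key.size();
  props->raw_value_size += end_key.size();
}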
diff --git a/table/table_test.cc b/table/table_test.cc
index 5e2adb7f2..96c568970 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -2432,18 +2432,25 @@ TEST_F(MemTableTest, Simple) {
   batch.Put(std::string("k2"), std::string("v2"));
   batch.Put(std::string("k3"), std::string("v3"));
   batch.Put(std::string("largekey"), std::string("vlarge"));
+  batch.DeleteRange(std::string("chi"), std::string("xigua"));
+  batch.DeleteRange(std::string("begin"), std::string("end"));
   ColumnFamilyMemTablesDefault cf_mems_default(memtable);
   ASSERT_TRUE(
       WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr).ok());
-  Arena arena;
-  ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena));
-  iter->SeekToFirst();
-  while (iter->Valid()) {
-    fprintf(stderr, "key: '%s' -> '%s'\n",
-            iter->key().ToString().c_str(),
-            iter->value().ToString().c_str());
-    iter->Next();
+  for (int i = 0; i < 2; ++i) {
+    Arena arena;
+    ScopedArenaIterator iter =
+        i == 0
+            ? ScopedArenaIterator(memtable->NewIterator(ReadOptions(), &arena))
+            : ScopedArenaIterator(
+                  memtable->NewRangeTombstoneIterator(ReadOptions(), &arena));
+    iter->SeekToFirst();
+    while (iter->Valid()) {
+      fprintf(stderr, "key: '%s' -> '%s'\n", iter->key().ToString().c_str(),
+              iter->value().ToString().c_str());
+      iter->Next();
+    }
   }
 
   delete memtable->Unref();
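For completeness, a sketch of how range deletions enter a memtable from the public write path, mirroring the updated MemTableTest.Simple above (the key names are placeholders; assumes the public rocksdb headers used by that test):

// Usage sketch: range deletions go through WriteBatch like point writes.
#include "rocksdb/write_batch.h"

void BuildBatch(rocksdb::WriteBatch* batch) {
  batch->Put("k1", "v1");
  // Deletes every key in ["begin", "end"); the tombstone is stored with
  // "begin" as the entry key and "end" as its value, which is what the new
  // range tombstone iterator exposes after InsertInto() runs.
  batch->DeleteRange("begin", "end");
}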