From 734e4acafbcea77934d5164c424b5084f0422c82 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Mon, 21 Nov 2016 21:08:06 -0800
Subject: [PATCH] Eliminate redundant cache lookup with range deletion

Summary:
When we introduced the range deletion block, TableCache::Get() and
TableCache::NewIterator() each did two table cache lookups: one for the range
deletion block iterator and another for getting the table reader to which the
Get()/NewIterator() is delegated. This extra cache lookup was very
CPU-intensive (about 10% overhead in a read-heavy benchmark). We can avoid it
by reusing the Cache::Handle created for the range deletion block iterator to
get the file reader.
Closes https://github.com/facebook/rocksdb/pull/1537

Differential Revision: D4201167

Pulled By: ajkr

fbshipit-source-id: d33ffd8
---
 db/db_compaction_test.cc |  21 +++--
 db/table_cache.cc        | 167 ++++++++++++++++-----------------
 db/table_cache.h         |  10 ---
 3 files changed, 77 insertions(+), 121 deletions(-)

diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 0c2492216..34edc5387 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -294,8 +294,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
       num_new_table_reader = 0;
       ASSERT_EQ(Key(k), Get(Key(k)));
       // lookup iterator from table cache and no need to create a new one.
-      // a second table cache iterator is created for range tombstones
-      ASSERT_EQ(num_table_cache_lookup, 2);
+      ASSERT_EQ(num_table_cache_lookup, 1);
       ASSERT_EQ(num_new_table_reader, 0);
     }
   }
@@ -306,9 +305,9 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   dbfull()->TEST_WaitForCompact();
   // Preloading iterator issues one table cache lookup and creates
   // a new table reader. One file is created for flush and one for compaction.
-  // Compaction inputs make no table cache look-up for data iterators and one
-  // look-up per compaction input file (three).
-  ASSERT_EQ(num_table_cache_lookup, 5);
+  // Compaction inputs make no table cache look-up for data/range deletion
+  // iterators.
+  ASSERT_EQ(num_table_cache_lookup, 2);
   // Create new iterator for:
   // (1) 1 for verifying flush results
   // (2) 3 for compaction input files
@@ -318,8 +317,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   num_table_cache_lookup = 0;
   num_new_table_reader = 0;
   ASSERT_EQ(Key(1), Get(Key(1)));
-  // a second table cache iterator is created for range tombstones
-  ASSERT_EQ(num_table_cache_lookup, 2);
+  ASSERT_EQ(num_table_cache_lookup, 1);
   ASSERT_EQ(num_new_table_reader, 0);
 
   num_table_cache_lookup = 0;
@@ -329,17 +327,16 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   cro.target_level = 2;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   db_->CompactRange(cro, nullptr, nullptr);
-  // Only verifying compaction outputs issues two table cache lookup
-  // (one for data block, one for range deletion block).
-  ASSERT_EQ(num_table_cache_lookup, 2);
+  // Only verifying compaction outputs issues one table cache lookup
+  // for both data block and range deletion block.
+  ASSERT_EQ(num_table_cache_lookup, 1);
   // One for compaction input, one for verifying compaction results.
   ASSERT_EQ(num_new_table_reader, 2);
 
   num_table_cache_lookup = 0;
   num_new_table_reader = 0;
   ASSERT_EQ(Key(1), Get(Key(1)));
-  // a second table cache iterator is created for range tombstones
-  ASSERT_EQ(num_table_cache_lookup, 2);
+  ASSERT_EQ(num_table_cache_lookup, 1);
   ASSERT_EQ(num_new_table_reader, 0);
 
   rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 6688dd124..908bbf943 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -175,17 +175,6 @@ InternalIterator* TableCache::NewIterator(
   PERF_TIMER_GUARD(new_table_iterator_nanos);
 
   Status s;
-  if (range_del_agg != nullptr && !options.ignore_range_deletions) {
-    std::unique_ptr<InternalIterator> range_del_iter(NewRangeDeletionIterator(
-        options, icomparator, fd, file_read_hist, skip_filters, level));
-    if (range_del_iter != nullptr) {
-      s = range_del_iter->status();
-    }
-    if (s.ok()) {
-      s = range_del_agg->AddTombstones(std::move(range_del_iter));
-    }
-  }
-
   bool create_new_table_reader = false;
   TableReader* table_reader = nullptr;
   Cache::Handle* handle = nullptr;
@@ -226,14 +215,15 @@ InternalIterator* TableCache::NewIterator(
       }
     }
   }
+  InternalIterator* result = nullptr;
   if (s.ok()) {
-    InternalIterator* result =
-        table_reader->NewIterator(options, arena, skip_filters);
+    result = table_reader->NewIterator(options, arena, skip_filters);
     if (create_new_table_reader) {
       assert(handle == nullptr);
       result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr);
     } else if (handle != nullptr) {
       result->RegisterCleanup(&UnrefEntry, cache_, handle);
+      handle = nullptr;  // prevent from releasing below
     }
 
     if (for_compaction) {
@@ -242,45 +232,26 @@ InternalIterator* TableCache::NewIterator(
     if (table_reader_ptr != nullptr) {
       *table_reader_ptr = table_reader;
     }
-    return result;
   }
-  if (handle != nullptr) {
-    ReleaseHandle(handle);
-  }
-  return NewErrorInternalIterator(s, arena);
-}
-
-InternalIterator* TableCache::NewRangeDeletionIterator(
-    const ReadOptions& options, const InternalKeyComparator& icmp,
-    const FileDescriptor& fd, HistogramImpl* file_read_hist, bool skip_filters,
-    int level) {
-  if (options.ignore_range_deletions) {
-    return nullptr;
-  }
-  Status s;
-  TableReader* table_reader = fd.table_reader;
-  Cache::Handle* cache_handle = nullptr;
-  if (table_reader == nullptr) {
-    s = FindTable(env_options_, icmp, fd, &cache_handle,
-                  options.read_tier == kBlockCacheTier /* no_io */,
-                  true /* record_read_stats */, file_read_hist, skip_filters,
-                  level);
+  if (s.ok() && range_del_agg != nullptr && !options.ignore_range_deletions) {
+    std::unique_ptr<InternalIterator> range_del_iter(
+        table_reader->NewRangeTombstoneIterator(options));
+    if (range_del_iter != nullptr) {
+      s = range_del_iter->status();
+    }
     if (s.ok()) {
-      table_reader = GetTableReaderFromHandle(cache_handle);
+      s = range_del_agg->AddTombstones(std::move(range_del_iter));
     }
   }
-  if (s.ok()) {
-    auto* result = table_reader->NewRangeTombstoneIterator(options);
-    if (cache_handle != nullptr) {
-      if (result == nullptr) {
-        ReleaseHandle(cache_handle);
-      } else {
-        result->RegisterCleanup(&UnrefEntry, cache_, cache_handle);
-      }
-    }
-    return result;
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
   }
-  return NewErrorInternalIterator(s);
+  if (!s.ok()) {
+    assert(result == nullptr);
+    result = NewErrorInternalIterator(s, arena);
+  }
+  return result;
 }
 
 Status TableCache::Get(const ReadOptions& options,
@@ -288,67 +259,52 @@ Status TableCache::Get(const ReadOptions& options,
                        const FileDescriptor& fd, const Slice& k,
                        GetContext* get_context, HistogramImpl* file_read_hist,
                        bool skip_filters, int level) {
-  Status s;
-  if (get_context->range_del_agg() != nullptr &&
-      !options.ignore_range_deletions) {
-    std::unique_ptr<InternalIterator> range_del_iter(NewRangeDeletionIterator(
-        options, internal_comparator, fd, file_read_hist, skip_filters, level));
-    if (range_del_iter != nullptr) {
-      s = range_del_iter->status();
-    }
-    if (s.ok()) {
-      s = get_context->range_del_agg()->AddTombstones(
-          std::move(range_del_iter));
-    }
-  }
-
-  TableReader* t = fd.table_reader;
-  Cache::Handle* handle = nullptr;
   std::string* row_cache_entry = nullptr;
   bool done = false;
 #ifndef ROCKSDB_LITE
   IterKey row_cache_key;
   std::string row_cache_entry_buffer;
-  if (s.ok()) {
-    // Check row cache if enabled. Since row cache does not currently store
-    // sequence numbers, we cannot use it if we need to fetch the sequence.
-    if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
-      uint64_t fd_number = fd.GetNumber();
-      auto user_key = ExtractUserKey(k);
-      // We use the user key as cache key instead of the internal key,
-      // otherwise the whole cache would be invalidated every time the
-      // sequence key increases. However, to support caching snapshot
-      // reads, we append the sequence number (incremented by 1 to
-      // distinguish from 0) only in this case.
-      uint64_t seq_no =
-          options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
-
-      // Compute row cache key.
-      row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
-                               row_cache_id_.size());
-      AppendVarint64(&row_cache_key, fd_number);
-      AppendVarint64(&row_cache_key, seq_no);
-      row_cache_key.TrimAppend(row_cache_key.Size(), user_key.data(),
-                               user_key.size());
-
-      if (auto row_handle =
-              ioptions_.row_cache->Lookup(row_cache_key.GetKey())) {
-        auto found_row_cache_entry = static_cast<const std::string*>(
-            ioptions_.row_cache->Value(row_handle));
-        replayGetContextLog(*found_row_cache_entry, user_key, get_context);
-        ioptions_.row_cache->Release(row_handle);
-        RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
-        done = true;
-      } else {
-        // Not found, setting up the replay log.
-        RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
-        row_cache_entry = &row_cache_entry_buffer;
-      }
+  // Check row cache if enabled. Since row cache does not currently store
+  // sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+    uint64_t fd_number = fd.GetNumber();
+    auto user_key = ExtractUserKey(k);
+    // We use the user key as cache key instead of the internal key,
+    // otherwise the whole cache would be invalidated every time the
+    // sequence key increases. However, to support caching snapshot
+    // reads, we append the sequence number (incremented by 1 to
+    // distinguish from 0) only in this case.
+    uint64_t seq_no =
+        options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+
+    // Compute row cache key.
+    row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+                             row_cache_id_.size());
+    AppendVarint64(&row_cache_key, fd_number);
+    AppendVarint64(&row_cache_key, seq_no);
+    row_cache_key.TrimAppend(row_cache_key.Size(), user_key.data(),
+                             user_key.size());
+
+    if (auto row_handle =
+            ioptions_.row_cache->Lookup(row_cache_key.GetKey())) {
+      auto found_row_cache_entry = static_cast<const std::string*>(
+          ioptions_.row_cache->Value(row_handle));
+      replayGetContextLog(*found_row_cache_entry, user_key, get_context);
+      ioptions_.row_cache->Release(row_handle);
+      RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
+      done = true;
+    } else {
+      // Not found, setting up the replay log.
+      RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
+      row_cache_entry = &row_cache_entry_buffer;
     }
   }
 #endif  // ROCKSDB_LITE
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
   if (!done && s.ok()) {
-    if (!t) {
+    if (t == nullptr) {
       s = FindTable(env_options_, internal_comparator, fd, &handle,
                     options.read_tier == kBlockCacheTier /* no_io */,
                     true /* record_read_stats */, file_read_hist, skip_filters,
@@ -368,6 +324,19 @@ Status TableCache::Get(const ReadOptions& options,
       done = true;
     }
   }
+  if (!done && s.ok() && get_context->range_del_agg() != nullptr &&
+      !options.ignore_range_deletions) {
+    std::unique_ptr<InternalIterator> range_del_iter(
+        t->NewRangeTombstoneIterator(options));
+    if (range_del_iter != nullptr) {
+      s = range_del_iter->status();
+    }
+    if (s.ok()) {
+      s = get_context->range_del_agg()->AddTombstones(
+          std::move(range_del_iter));
+    }
+  }
+
 #ifndef ROCKSDB_LITE
   // Put the replay log in row cache only if something was found.
   if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
diff --git a/db/table_cache.h b/db/table_cache.h
index e5f31770b..7165283b6 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -58,16 +58,6 @@ class TableCache {
                                 HistogramImpl* file_read_hist = nullptr,
                                 bool for_compaction = false, Arena* arena = nullptr,
                                 bool skip_filters = false, int level = -1);
-  // Return an iterator over the range deletion meta-block for the specified
-  // file number.
-  // @param skip_filters Disables loading/accessing the filter block
-  // @param level The level this table is at, -1 for "not set / don't know"
-  InternalIterator* NewRangeDeletionIterator(const ReadOptions& options,
-                                             const InternalKeyComparator& icmp,
-                                             const FileDescriptor& fd,
-                                             HistogramImpl* file_read_hist,
-                                             bool skip_filters, int level);
-
   // If a seek to internal key "k" in specified file finds an entry,
   // call (*handle_result)(arg, found_key, found_value) repeatedly until
   // it returns false.
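
The following is an illustrative, standalone sketch (not RocksDB code) of the pattern the patch applies inside TableCache::NewIterator() and TableCache::Get(): look the file up in the table cache once, reuse the resulting table reader for both the data iterator and the range tombstone iterator, and release the Cache::Handle exactly once, either through the iterator's cleanup callback or through the fallback release at the end. All names below (ToyTableCache, Handle, TableReader, Iterator, NewIterators) are invented for the example.

#include <cassert>
#include <functional>
#include <iostream>
#include <memory>
#include <string>

// Invented stand-ins for the table cache, its handles, and the iterators.
struct Handle {
  std::string file;
};

struct TableReader {
  int NewIterator() const { return 1; }                // pretend data iterator
  int NewRangeTombstoneIterator() const { return 2; }  // pretend tombstone iterator
};

struct Iterator {
  int id = 0;
  std::function<void()> cleanup;  // runs when the iterator is destroyed
  ~Iterator() {
    if (cleanup) {
      cleanup();
    }
  }
};

struct ToyTableCache {
  int lookups = 0;
  int releases = 0;

  Handle* Lookup(const std::string& file) {
    ++lookups;
    return new Handle{file};
  }
  TableReader* GetReader(Handle* /*handle*/) {
    static TableReader reader;
    return &reader;
  }
  void Release(Handle* handle) {
    ++releases;
    delete handle;
  }
};

// One cache lookup serves both iterators; the handle is released exactly once.
std::unique_ptr<Iterator> NewIterators(ToyTableCache* cache,
                                       const std::string& file,
                                       int* tombstone_iter_out) {
  Handle* handle = cache->Lookup(file);  // the single table cache lookup
  TableReader* reader = cache->GetReader(handle);

  auto result = std::make_unique<Iterator>();
  result->id = reader->NewIterator();
  // Transfer handle ownership to the iterator's cleanup, then null the local
  // pointer so it is not released a second time below (this mirrors the
  // "handle = nullptr;  // prevent from releasing below" line in the patch).
  result->cleanup = [cache, handle] { cache->Release(handle); };
  handle = nullptr;

  // Reuse the same reader -- and therefore the same pinned cache entry -- for
  // the range tombstones instead of doing a second Lookup().
  *tombstone_iter_out = reader->NewRangeTombstoneIterator();

  if (handle != nullptr) {
    // Fallback for paths that did not hand the handle off (the error paths in
    // the real code).
    cache->Release(handle);
  }
  return result;
}

int main() {
  ToyTableCache cache;
  int tombstone_iter = 0;
  {
    auto iter = NewIterators(&cache, "000123.sst", &tombstone_iter);
  }  // data iterator destroyed here -> its cleanup releases the handle
  std::cout << "lookups=" << cache.lookups << " releases=" << cache.releases
            << "\n";  // prints: lookups=1 releases=1
  assert(cache.lookups == 1 && cache.releases == 1);
  return 0;
}

In the actual patch the same ordering shows up in TableCache::Get(): the range tombstones are added to the aggregator only after the table reader has been resolved (either from fd.table_reader or via the single FindTable() call), so no separate cache lookup is needed for the range deletion block.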