From 734e4acafbcea77934d5164c424b5084f0422c82 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Mon, 21 Nov 2016 21:08:06 -0800
Subject: [PATCH] Eliminate redundant cache lookup with range deletion

Summary:
When we introduced the range deletion block, TableCache::Get() and
TableCache::NewIterator() each did two table cache lookups: one for the range
deletion block iterator and another for getting the table reader to which the
Get()/NewIterator() is delegated. This extra cache lookup was very
CPU-intensive (about 10% overhead in a read-heavy benchmark). We can avoid it
by reusing the Cache::Handle created for the range deletion block iterator to
get the file reader.
Closes https://github.com/facebook/rocksdb/pull/1537

Differential Revision: D4201167

Pulled By: ajkr

fbshipit-source-id: d33ffd8
---
 db/db_compaction_test.cc |  21 +++--
 db/table_cache.cc        | 167 ++++++++++++++++-----------------
 db/table_cache.h         |  10 ---
 3 files changed, 77 insertions(+), 121 deletions(-)

diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 0c2492216..34edc5387 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -294,8 +294,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
       num_new_table_reader = 0;
       ASSERT_EQ(Key(k), Get(Key(k)));
       // lookup iterator from table cache and no need to create a new one.
-      // a second table cache iterator is created for range tombstones
-      ASSERT_EQ(num_table_cache_lookup, 2);
+      ASSERT_EQ(num_table_cache_lookup, 1);
       ASSERT_EQ(num_new_table_reader, 0);
     }
   }
@@ -306,9 +305,9 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   dbfull()->TEST_WaitForCompact();
   // Preloading iterator issues one table cache lookup and creates
   // a new table reader. One file is created for flush and one for compaction.
-  // Compaction inputs make no table cache look-up for data iterators and one
-  // look-up per compaction input file (three).
-  ASSERT_EQ(num_table_cache_lookup, 5);
+  // Compaction inputs make no table cache look-up for data/range deletion
+  // iterators.
+  ASSERT_EQ(num_table_cache_lookup, 2);
   // Create new iterator for:
   // (1) 1 for verifying flush results
   // (2) 3 for compaction input files
@@ -318,8 +317,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   num_table_cache_lookup = 0;
   num_new_table_reader = 0;
   ASSERT_EQ(Key(1), Get(Key(1)));
-  // a second table cache iterator is created for range tombstones
-  ASSERT_EQ(num_table_cache_lookup, 2);
+  ASSERT_EQ(num_table_cache_lookup, 1);
   ASSERT_EQ(num_new_table_reader, 0);
 
   num_table_cache_lookup = 0;
@@ -329,17 +327,16 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   cro.target_level = 2;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   db_->CompactRange(cro, nullptr, nullptr);
-  // Only verifying compaction outputs issues two table cache lookup
-  // (one for data block, one for range deletion block).
-  ASSERT_EQ(num_table_cache_lookup, 2);
+  // Only verifying compaction outputs issues one table cache lookup
+  // for both data block and range deletion block.
+  ASSERT_EQ(num_table_cache_lookup, 1);
   // One for compaction input, one for verifying compaction results.
   ASSERT_EQ(num_new_table_reader, 2);
 
   num_table_cache_lookup = 0;
   num_new_table_reader = 0;
   ASSERT_EQ(Key(1), Get(Key(1)));
-  // a second table cache iterator is created for range tombstones
-  ASSERT_EQ(num_table_cache_lookup, 2);
+  ASSERT_EQ(num_table_cache_lookup, 1);
   ASSERT_EQ(num_new_table_reader, 0);
 
   rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 6688dd124..908bbf943 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -175,17 +175,6 @@ InternalIterator* TableCache::NewIterator(
   PERF_TIMER_GUARD(new_table_iterator_nanos);
 
   Status s;
-  if (range_del_agg != nullptr && !options.ignore_range_deletions) {
-    std::unique_ptr<InternalIterator> range_del_iter(NewRangeDeletionIterator(
-        options, icomparator, fd, file_read_hist, skip_filters, level));
-    if (range_del_iter != nullptr) {
-      s = range_del_iter->status();
-    }
-    if (s.ok()) {
-      s = range_del_agg->AddTombstones(std::move(range_del_iter));
-    }
-  }
-
   bool create_new_table_reader = false;
   TableReader* table_reader = nullptr;
   Cache::Handle* handle = nullptr;
@@ -226,14 +215,15 @@ InternalIterator* TableCache::NewIterator(
       }
     }
   }
+  InternalIterator* result = nullptr;
   if (s.ok()) {
-    InternalIterator* result =
-        table_reader->NewIterator(options, arena, skip_filters);
+    result = table_reader->NewIterator(options, arena, skip_filters);
     if (create_new_table_reader) {
       assert(handle == nullptr);
       result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr);
     } else if (handle != nullptr) {
       result->RegisterCleanup(&UnrefEntry, cache_, handle);
+      handle = nullptr;  // prevent from releasing below
     }
 
     if (for_compaction) {
@@ -242,45 +232,26 @@ InternalIterator* TableCache::NewIterator(
     if (table_reader_ptr != nullptr) {
       *table_reader_ptr = table_reader;
     }
-    return result;
   }
-  if (handle != nullptr) {
-    ReleaseHandle(handle);
-  }
-  return NewErrorInternalIterator(s, arena);
-}
-
-InternalIterator* TableCache::NewRangeDeletionIterator(
-    const ReadOptions& options, const InternalKeyComparator& icmp,
-    const FileDescriptor& fd, HistogramImpl* file_read_hist, bool skip_filters,
-    int level) {
-  if (options.ignore_range_deletions) {
-    return nullptr;
-  }
-  Status s;
-  TableReader* table_reader = fd.table_reader;
-  Cache::Handle* cache_handle = nullptr;
-  if (table_reader == nullptr) {
-    s = FindTable(env_options_, icmp, fd, &cache_handle,
-                  options.read_tier == kBlockCacheTier /* no_io */,
-                  true /* record_read_stats */, file_read_hist, skip_filters,
-                  level);
+  if (s.ok() && range_del_agg != nullptr && !options.ignore_range_deletions) {
+    std::unique_ptr<InternalIterator> range_del_iter(
+        table_reader->NewRangeTombstoneIterator(options));
+    if (range_del_iter != nullptr) {
+      s = range_del_iter->status();
+    }
     if (s.ok()) {
-      table_reader = GetTableReaderFromHandle(cache_handle);
+      s = range_del_agg->AddTombstones(std::move(range_del_iter));
     }
   }
-  if (s.ok()) {
-    auto* result = table_reader->NewRangeTombstoneIterator(options);
-    if (cache_handle != nullptr) {
-      if (result == nullptr) {
-        ReleaseHandle(cache_handle);
-      } else {
-        result->RegisterCleanup(&UnrefEntry, cache_, cache_handle);
-      }
-    }
-    return result;
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
   }
-  return NewErrorInternalIterator(s);
+  if (!s.ok()) {
+    assert(result == nullptr);
+    result = NewErrorInternalIterator(s, arena);
+  }
+  return result;
 }
 
 Status TableCache::Get(const ReadOptions& options,
@@ -288,67 +259,52 @@ Status TableCache::Get(const ReadOptions& options,
                        const FileDescriptor& fd, const Slice& k,
                        GetContext* get_context, HistogramImpl* file_read_hist,
                        bool skip_filters, int level) {
-  Status s;
-  if (get_context->range_del_agg() != nullptr &&
-      !options.ignore_range_deletions) {
-    std::unique_ptr<InternalIterator> range_del_iter(NewRangeDeletionIterator(
-        options, internal_comparator, fd, file_read_hist, skip_filters, level));
-    if (range_del_iter != nullptr) {
-      s = range_del_iter->status();
-    }
-    if (s.ok()) {
-      s = get_context->range_del_agg()->AddTombstones(
-          std::move(range_del_iter));
-    }
-  }
-
-  TableReader* t = fd.table_reader;
-  Cache::Handle* handle = nullptr;
   std::string* row_cache_entry = nullptr;
   bool done = false;
 #ifndef ROCKSDB_LITE
   IterKey row_cache_key;
   std::string row_cache_entry_buffer;
-  if (s.ok()) {
-    // Check row cache if enabled. Since row cache does not currently store
-    // sequence numbers, we cannot use it if we need to fetch the sequence.
-    if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
-      uint64_t fd_number = fd.GetNumber();
-      auto user_key = ExtractUserKey(k);
-      // We use the user key as cache key instead of the internal key,
-      // otherwise the whole cache would be invalidated every time the
-      // sequence key increases. However, to support caching snapshot
-      // reads, we append the sequence number (incremented by 1 to
-      // distinguish from 0) only in this case.
-      uint64_t seq_no =
-          options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
-
-      // Compute row cache key.
-      row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
-                               row_cache_id_.size());
-      AppendVarint64(&row_cache_key, fd_number);
-      AppendVarint64(&row_cache_key, seq_no);
-      row_cache_key.TrimAppend(row_cache_key.Size(), user_key.data(),
-                               user_key.size());
-
-      if (auto row_handle =
-              ioptions_.row_cache->Lookup(row_cache_key.GetKey())) {
-        auto found_row_cache_entry = static_cast<const std::string*>(
-            ioptions_.row_cache->Value(row_handle));
-        replayGetContextLog(*found_row_cache_entry, user_key, get_context);
-        ioptions_.row_cache->Release(row_handle);
-        RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
-        done = true;
-      } else {
-        // Not found, setting up the replay log.
-        RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
-        row_cache_entry = &row_cache_entry_buffer;
-      }
+  // Check row cache if enabled. Since row cache does not currently store
+  // sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+    uint64_t fd_number = fd.GetNumber();
+    auto user_key = ExtractUserKey(k);
+    // We use the user key as cache key instead of the internal key,
+    // otherwise the whole cache would be invalidated every time the
+    // sequence key increases. However, to support caching snapshot
+    // reads, we append the sequence number (incremented by 1 to
+    // distinguish from 0) only in this case.
+    uint64_t seq_no =
+        options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+
+    // Compute row cache key.
+    row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+                             row_cache_id_.size());
+    AppendVarint64(&row_cache_key, fd_number);
+    AppendVarint64(&row_cache_key, seq_no);
+    row_cache_key.TrimAppend(row_cache_key.Size(), user_key.data(),
+                             user_key.size());
+
+    if (auto row_handle =
+            ioptions_.row_cache->Lookup(row_cache_key.GetKey())) {
+      auto found_row_cache_entry = static_cast<const std::string*>(
+          ioptions_.row_cache->Value(row_handle));
+      replayGetContextLog(*found_row_cache_entry, user_key, get_context);
+      ioptions_.row_cache->Release(row_handle);
+      RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
+      done = true;
+    } else {
+      // Not found, setting up the replay log.
+      RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
+      row_cache_entry = &row_cache_entry_buffer;
     }
   }
 #endif  // ROCKSDB_LITE
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
   if (!done && s.ok()) {
-    if (!t) {
+    if (t == nullptr) {
       s = FindTable(env_options_, internal_comparator, fd, &handle,
                     options.read_tier == kBlockCacheTier /* no_io */,
                     true /* record_read_stats */, file_read_hist, skip_filters,
@@ -368,6 +324,19 @@ Status TableCache::Get(const ReadOptions& options,
       done = true;
     }
   }
+  if (!done && s.ok() && get_context->range_del_agg() != nullptr &&
+      !options.ignore_range_deletions) {
+    std::unique_ptr<InternalIterator> range_del_iter(
+        t->NewRangeTombstoneIterator(options));
+    if (range_del_iter != nullptr) {
+      s = range_del_iter->status();
+    }
+    if (s.ok()) {
+      s = get_context->range_del_agg()->AddTombstones(
+          std::move(range_del_iter));
+    }
+  }
+
 #ifndef ROCKSDB_LITE
   // Put the replay log in row cache only if something was found.
   if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
diff --git a/db/table_cache.h b/db/table_cache.h
index e5f31770b..7165283b6 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -58,16 +58,6 @@ class TableCache {
                                 HistogramImpl* file_read_hist = nullptr,
                                 bool for_compaction = false, Arena* arena = nullptr,
                                 bool skip_filters = false, int level = -1);
-  // Return an iterator over the range deletion meta-block for the specified
-  // file number.
-  // @param skip_filters Disables loading/accessing the filter block
-  // @param level The level this table is at, -1 for "not set / don't know"
-  InternalIterator* NewRangeDeletionIterator(const ReadOptions& options,
-                                             const InternalKeyComparator& icmp,
-                                             const FileDescriptor& fd,
-                                             HistogramImpl* file_read_hist,
-                                             bool skip_filters, int level);
-
   // If a seek to internal key "k" in specified file finds an entry,
   // call (*handle_result)(arg, found_key, found_value) repeatedly until
   // it returns false.
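
The following is an illustrative, standalone sketch (not RocksDB code) of the pattern the patch applies inside TableCache::NewIterator() and TableCache::Get(): look the file up in the table cache once, reuse the resulting table reader for both the data iterator and the range tombstone iterator, and release the Cache::Handle exactly once, either through the iterator's cleanup callback or through the fallback release at the end. All names below (ToyTableCache, Handle, TableReader, Iterator, NewIterators) are invented for the example.

#include <cassert>
#include <functional>
#include <iostream>
#include <memory>
#include <string>

// Invented stand-ins for the table cache, its handles, and the iterators.
struct Handle {
  std::string file;
};

struct TableReader {
  int NewIterator() const { return 1; }                // pretend data iterator
  int NewRangeTombstoneIterator() const { return 2; }  // pretend tombstone iterator
};

struct Iterator {
  int id = 0;
  std::function<void()> cleanup;  // runs when the iterator is destroyed
  ~Iterator() {
    if (cleanup) {
      cleanup();
    }
  }
};

struct ToyTableCache {
  int lookups = 0;
  int releases = 0;

  Handle* Lookup(const std::string& file) {
    ++lookups;
    return new Handle{file};
  }
  TableReader* GetReader(Handle* /*handle*/) {
    static TableReader reader;
    return &reader;
  }
  void Release(Handle* handle) {
    ++releases;
    delete handle;
  }
};

// One cache lookup serves both iterators; the handle is released exactly once.
std::unique_ptr<Iterator> NewIterators(ToyTableCache* cache,
                                       const std::string& file,
                                       int* tombstone_iter_out) {
  Handle* handle = cache->Lookup(file);  // the single table cache lookup
  TableReader* reader = cache->GetReader(handle);

  auto result = std::make_unique<Iterator>();
  result->id = reader->NewIterator();
  // Transfer handle ownership to the iterator's cleanup, then null the local
  // pointer so it is not released a second time below (this mirrors the
  // "handle = nullptr;  // prevent from releasing below" line in the patch).
  result->cleanup = [cache, handle] { cache->Release(handle); };
  handle = nullptr;

  // Reuse the same reader -- and therefore the same pinned cache entry -- for
  // the range tombstones instead of doing a second Lookup().
  *tombstone_iter_out = reader->NewRangeTombstoneIterator();

  if (handle != nullptr) {
    // Fallback for paths that did not hand the handle off (the error paths in
    // the real code).
    cache->Release(handle);
  }
  return result;
}

int main() {
  ToyTableCache cache;
  int tombstone_iter = 0;
  {
    auto iter = NewIterators(&cache, "000123.sst", &tombstone_iter);
  }  // data iterator destroyed here -> its cleanup releases the handle
  std::cout << "lookups=" << cache.lookups << " releases=" << cache.releases
            << "\n";  // prints: lookups=1 releases=1
  assert(cache.lookups == 1 && cache.releases == 1);
  return 0;
}

In the actual patch the same ordering shows up in TableCache::Get(): the range tombstones are added to the aggregator only after the table reader has been resolved (either from fd.table_reader or via the single FindTable() call), so no separate cache lookup is needed for the range deletion block.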