From 66b5613d0c3f84e5ef72c43b62a2e9866efdde8a Mon Sep 17 00:00:00 2001
From: sdong <siying.d@fb.com>
Date: Mon, 22 Jul 2019 18:53:03 -0700
Subject: [PATCH] row_cache to share entry for recent snapshots (#5600)

Summary:
Right now, users cannot take advantage of row cache, unless no snapshot is used, or Get() is repeated for the same snapshots. This limits the usage of row cache.
This change eliminate this restriction in some cases. If the snapshot used is newer than the largest sequence number in the file, and write callback function is not registered, the same row cache key is used as no snapshot is given. We still need the callback function restriction for now because the callback function may filter out different keys for different snapshots even if the snapshots are new.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5600

Test Plan: Add a unit test.

Differential Revision: D16386616

fbshipit-source-id: 6b7d214bd215d191b03ccf55926ad4b703ec2e53
---
 HISTORY.md          |  1 +
 db/db_test2.cc      | 46 +++++++++++++++++++++++++++++++++++++++++++++
 db/table_cache.cc   | 21 +++++++++++++++++++--
 table/get_context.h |  2 ++
 4 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index b9d0f7413..efd49f642 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -33,6 +33,7 @@
 * Reduce iterator key comparision for upper/lower bound check.
 * Log Writer will flush after finishing the whole record, rather than a fragment.
 * Lower MultiGet batching API latency by reading data blocks from disk in parallel
+* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases.
 
 ### General Improvements
 * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress.
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 109a7a377..3664b3a24 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -3771,6 +3771,52 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
   delete db_;
   db_ = nullptr;
 }
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 121d4941f..2290b5939 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -11,6 +11,7 @@
 
 #include "db/dbformat.h"
 #include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
 #include "db/version_edit.h"
 #include "file/filename.h"
 
@@ -24,6 +25,7 @@
 #include "table/table_builder.h"
 #include "table/table_reader.h"
 #include "test_util/sync_point.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
@@ -277,8 +279,23 @@ Status TableCache::Get(const ReadOptions& options,
     // sequence key increases. However, to support caching snapshot
     // reads, we append the sequence number (incremented by 1 to
     // distinguish from 0) only in this case.
-    uint64_t seq_no =
-        options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+    // If the snapshot is larger than the largest seqno in the file,
+    // all data should be exposed to the snapshot, so we treat it
+    // the same as there is no snapshot. The exception is that if
+    // a seq-checking callback is registered, some internal keys
+    // may still be filtered out.
+    uint64_t seq_no = 0;
+    // Maybe we can include the whole file ifsnapshot == fd.largest_seqno.
+    if (options.snapshot != nullptr &&
+        (get_context->has_callback() ||
+         static_cast_with_check<const SnapshotImpl, const Snapshot>(
+             options.snapshot)
+                 ->GetSequenceNumber() <= fd.largest_seqno)) {
+      // We should consider to use options.snapshot->GetSequenceNumber()
+      // instead of GetInternalKeySeqno(k), which will make the code
+      // easier to understand.
+      seq_no = 1 + GetInternalKeySeqno(k);
+    }
 
     // Compute row cache key.
     row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
diff --git a/table/get_context.h b/table/get_context.h
index 7a37beb2d..7110ceae8 100644
--- a/table/get_context.h
+++ b/table/get_context.h
@@ -136,6 +136,8 @@ class GetContext {
 
   void ReportCounters();
 
+  bool has_callback() const { return callback_ != nullptr; }
+
   uint64_t get_tracing_get_id() const { return tracing_get_id_; }
 
  private: