row_cache to share entry for recent snapshots (#5600)

Summary: Right now, users cannot take advantage of row cache, unless no snapshot is used, or Get() is repeated for the same snapshots. This limits the usage of row cache. This change eliminate this restriction in some cases. If the snapshot used is newer than the largest sequence number in the file, and write callback function is not registered, the same row cache key is used as no snapshot is given. We still need the callback function restriction for now because the callback function may filter out different keys for different snapshots even if the snapshots are new. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5600 Test Plan: Add a unit test. Differential Revision: D16386616 fbshipit-source-id: 6b7d214bd215d191b03ccf55926ad4b703ec2e53
6 years ago · 66b5613d0c
parent 3778470061
commit 66b5613d0c
4 changed files with 68 additions and 2 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -33,6 +33,7 @@
 * Reduce iterator key comparision for upper/lower bound check.
 * Log Writer will flush after finishing the whole record, rather than a fragment.
 * Lower MultiGet batching API latency by reading data blocks from disk in parallel
+* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases.

 ### General Improvements
 * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress.
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@ -3771,6 +3771,52 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
  delete db_;
  db_ = nullptr;
 }
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
 }  // namespace rocksdb

 int main(int argc, char** argv) {
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -11,6 +11,7 @@

 #include "db/dbformat.h"
 #include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
 #include "db/version_edit.h"
 #include "file/filename.h"

@ -24,6 +25,7 @@
 #include "table/table_builder.h"
 #include "table/table_reader.h"
 #include "test_util/sync_point.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
@ -277,8 +279,23 @@ Status TableCache::Get(const ReadOptions& options,
    // sequence key increases. However, to support caching snapshot
    // reads, we append the sequence number (incremented by 1 to
    // distinguish from 0) only in this case.
-    uint64_t seq_no =
-        options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+    // If the snapshot is larger than the largest seqno in the file,
+    // all data should be exposed to the snapshot, so we treat it
+    // the same as there is no snapshot. The exception is that if
+    // a seq-checking callback is registered, some internal keys
+    // may still be filtered out.
+    uint64_t seq_no = 0;
+    // Maybe we can include the whole file ifsnapshot == fd.largest_seqno.
+    if (options.snapshot != nullptr &&
+        (get_context->has_callback() ||
+         static_cast_with_check<const SnapshotImpl, const Snapshot>(
+             options.snapshot)
+                 ->GetSequenceNumber() <= fd.largest_seqno)) {
+      // We should consider to use options.snapshot->GetSequenceNumber()
+      // instead of GetInternalKeySeqno(k), which will make the code
+      // easier to understand.
+      seq_no = 1 + GetInternalKeySeqno(k);
+    }

    // Compute row cache key.
    row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
--- a/table/get_context.h
+++ b/table/get_context.h
@ -136,6 +136,8 @@ class GetContext {

  void ReportCounters();

+  bool has_callback() const { return callback_ != nullptr; }
+
  uint64_t get_tracing_get_id() const { return tracing_get_id_; }

 private: