From f8509653ba50f1e157ceb68f44159c9a28789240 Mon Sep 17 00:00:00 2001
From: sdong
Date: Wed, 9 Oct 2013 17:04:40 -0700
Subject: [PATCH] LRUCache: try to evict unreferenced entries first.

Summary: With this patch, when LRUCache.Insert() is called and the cache is
full, it first tries to free up entries whose reference count is 1 (it would
drop to 0 after removal from the cache). We do this in two passes: in the
first pass, we only try to release those unreferenced entries. If we cannot
free enough space after traversing the first remove_scan_count_limit_ entries,
we start from the beginning again and also remove entries that are still
being referenced.

Test Plan: add two unit tests to cover the code

Reviewers: dhruba, haobo, emayanke

Reviewed By: emayanke

CC: leveldb, emayanke, xjin

Differential Revision: https://reviews.facebook.net/D13377
---
 db/db_bench.cc            |   5 +-
 db/table_cache.cc         |   5 +-
 include/rocksdb/cache.h   |  16 +++-
 include/rocksdb/options.h |   9 ++
 util/cache.cc             |  49 +++++++++--
 util/cache_test.cc        | 170 ++++++++++++++++++++++++++++++++++++--
 util/options.cc           |   3 +
 7 files changed, 238 insertions(+), 19 deletions(-)

diff --git a/db/db_bench.cc b/db/db_bench.cc
index 523fc7220..496ca3619 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -196,6 +196,8 @@ static const char* FLAGS_db = nullptr;
 // if FLAGS_cache_size is non-negative.
 static int FLAGS_cache_numshardbits = -1;
 
+static int FLAGS_cache_remove_scan_count_limit = 32;
+
 // Verify checksum for every block read from storage
 static bool FLAGS_verify_checksum = false;
 
@@ -816,7 +818,8 @@ class Benchmark {
   Benchmark()
   : cache_(FLAGS_cache_size >= 0 ?
            (FLAGS_cache_numshardbits >= 1 ?
-            NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits) :
+            NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits,
+                        FLAGS_cache_remove_scan_count_limit) :
             NewLRUCache(FLAGS_cache_size)) : nullptr),
     filter_policy_(FLAGS_bloom_bits >= 0
                    ? NewBloomFilterPolicy(FLAGS_bloom_bits)
diff --git a/db/table_cache.cc b/db/table_cache.cc
index cf544fefe..cf284b60a 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -32,7 +32,10 @@ TableCache::TableCache(const std::string& dbname,
       dbname_(dbname),
       options_(options),
       storage_options_(storage_options),
-      cache_(NewLRUCache(entries, options->table_cache_numshardbits)) {}
+      cache_(
+          NewLRUCache(entries, options->table_cache_numshardbits,
+                      options->table_cache_remove_scan_count_limit)) {
+}
 
 TableCache::~TableCache() {
 }
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index f9106511d..c912c9ccd 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -28,10 +28,22 @@ using std::shared_ptr;
 
 class Cache;
 
-// Create a new cache with a fixed size capacity. This implementation
-// of Cache uses a least-recently-used eviction policy.
+// Create a new cache with a fixed size capacity. The cache is sharded
+// into 2^numShardBits shards, by hash of the key. The total capacity
+// is divided evenly among the shards. Inside each shard, eviction is
+// done in two passes: first, try to free space by evicting entries
+// that are among the least-recently-used removeScanCountLimit entries
+// and are not referenced by anything other than the cache itself, in
+// LRU order. If that does not free enough space, evict the remaining
+// entries in strict LRU order.
+//
+// The overloads without the numShardBits and/or removeScanCountLimit
+// parameters use default values. removeScanCountLimit's default value
+// is 0, which means strict LRU ordering inside each shard.
 extern shared_ptr<Cache> NewLRUCache(size_t capacity);
 extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                                     int removeScanCountLimit);
 
 class Cache {
  public:
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index a160acee1..9bb5330bd 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -442,6 +442,15 @@ struct Options {
   // Number of shards used for table cache.
   int table_cache_numshardbits;
 
+  // During eviction from the table cache's LRU cache, it would be
+  // inefficient to strictly follow LRU order, because a table's memory
+  // is not actually released until its refcount falls to zero. Instead,
+  // eviction makes two passes: the first pass releases only items whose
+  // refcount is 1, and if not enough space has been released after
+  // scanning the number of elements given by this parameter, items are
+  // then removed in plain LRU order.
+  int table_cache_remove_scan_count_limit;
+
   // size of one block in arena memory allocation.
   // If <= 0, a proper value is automatically calculated (usually 1/10 of
   // writer_buffer_size).
diff --git a/util/cache.cc b/util/cache.cc
index df8b63fc5..cfca0aed2 100644
--- a/util/cache.cc
+++ b/util/cache.cc
@@ -140,6 +140,9 @@ class LRUCache {
 
   // Separate from constructor so caller can easily make an array of LRUCache
   void SetCapacity(size_t capacity) { capacity_ = capacity; }
+  void SetRemoveScanCountLimit(size_t remove_scan_count_limit) {
+    remove_scan_count_limit_ = remove_scan_count_limit;
+  }
 
   // Like Cache methods, but with an extra "hash" parameter.
   Cache::Handle* Insert(const Slice& key, uint32_t hash,
@@ -160,6 +163,7 @@
 
   // Initialized before use.
   size_t capacity_;
+  uint32_t remove_scan_count_limit_;
 
   // mutex_ protects the following state.
   port::Mutex mutex_;
@@ -270,6 +274,27 @@ Cache::Handle* LRUCache::Insert(
     }
   }
 
+  if (remove_scan_count_limit_ > 0) {
+    // First try to free space by evicting entries that are referenced
+    // only by the cache itself.
+    LRUHandle* cur = lru_.next;
+    for (unsigned int scanCount = 0;
+         usage_ > capacity_ && cur != &lru_
+         && scanCount < remove_scan_count_limit_; scanCount++) {
+      LRUHandle* next = cur->next;
+      if (cur->refs <= 1) {
+        LRU_Remove(cur);
+        table_.Remove(cur->key(), cur->hash);
+        if (Unref(cur)) {
+          last_reference_list.push_back(cur);
+        }
+      }
+      cur = next;
+    }
+  }
+
+  // Then free space following the strict LRU policy until enough
+  // space is freed.
   while (usage_ > capacity_ && lru_.next != &lru_) {
     LRUHandle* old = lru_.next;
     LRU_Remove(old);
@@ -307,7 +332,8 @@ void LRUCache::Erase(const Slice& key, uint32_t hash) {
   }
 }
 
-static int kNumShardBits = 4;  // default values, can be overridden
+static int kNumShardBits = 4; // default values, can be overridden
+static int kRemoveScanCountLimit = 0; // default values, can be overridden
 
 class ShardedLRUCache : public Cache {
  private:
@@ -326,7 +352,7 @@ class ShardedLRUCache : public Cache {
     return (numShardBits > 0) ?
            (hash >> (32 - numShardBits)) : 0;
   }
 
-  void init(size_t capacity, int numbits) {
+  void init(size_t capacity, int numbits, int removeScanCountLimit) {
     numShardBits = numbits;
     capacity_ = capacity;
     int numShards = 1 << numShardBits;
@@ -334,17 +360,19 @@ class ShardedLRUCache : public Cache {
     const size_t per_shard = (capacity + (numShards - 1)) / numShards;
     for (int s = 0; s < numShards; s++) {
       shard_[s].SetCapacity(per_shard);
+      shard_[s].SetRemoveScanCountLimit(removeScanCountLimit);
     }
   }
 
  public:
   explicit ShardedLRUCache(size_t capacity)
       : last_id_(0) {
-    init(capacity, kNumShardBits);
+    init(capacity, kNumShardBits, kRemoveScanCountLimit);
   }
-  ShardedLRUCache(size_t capacity, int numShardBits)
+  ShardedLRUCache(size_t capacity, int numShardBits,
+                  int removeScanCountLimit)
       : last_id_(0) {
-    init(capacity, numShardBits);
+    init(capacity, numShardBits, removeScanCountLimit);
   }
   virtual ~ShardedLRUCache() {
     delete[] shard_;
@@ -381,14 +409,21 @@ class ShardedLRUCache : public Cache {
 }  // end anonymous namespace
 
 shared_ptr<Cache> NewLRUCache(size_t capacity) {
-  return std::make_shared<ShardedLRUCache>(capacity);
+  return NewLRUCache(capacity, kNumShardBits);
 }
 
 shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits) {
+  return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit);
+}
+
+shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                              int removeScanCountLimit) {
   if (numShardBits >= 20) {
     return nullptr;  // the cache cannot be sharded into too many fine pieces
   }
-  return std::make_shared<ShardedLRUCache>(capacity, numShardBits);
+  return std::make_shared<ShardedLRUCache>(capacity,
+                                           numShardBits,
+                                           removeScanCountLimit);
 }
 
 }  // namespace rocksdb
diff --git a/util/cache_test.cc b/util/cache_test.cc
index 00c02589a..fe3bfd68f 100644
--- a/util/cache_test.cc
+++ b/util/cache_test.cc
@@ -35,33 +35,69 @@ class CacheTest {
   }
 
   static const int kCacheSize = 1000;
+  static const int kNumShardBits = 4;
+  static const int kRemoveScanCountLimit = 16;
+
+  static const int kCacheSize2 = 100;
+  static const int kNumShardBits2 = 2;
+  static const int kRemoveScanCountLimit2 = 200;
+
   std::vector<int> deleted_keys_;
   std::vector<int> deleted_values_;
   shared_ptr<Cache> cache_;
+  shared_ptr<Cache> cache2_;
 
-  CacheTest() : cache_(NewLRUCache(kCacheSize)) {
+  CacheTest() :
+      cache_(NewLRUCache(kCacheSize, kNumShardBits, kRemoveScanCountLimit)),
+      cache2_(NewLRUCache(kCacheSize2, kNumShardBits2,
+                          kRemoveScanCountLimit2)) {
     current_ = this;
   }
 
   ~CacheTest() {
   }
 
-  int Lookup(int key) {
-    Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
-    const int r = (handle == nullptr) ? -1 : DecodeValue(cache_->Value(handle));
+  int Lookup(shared_ptr<Cache> cache, int key) {
+    Cache::Handle* handle = cache->Lookup(EncodeKey(key));
+    const int r = (handle == nullptr) ? -1 : DecodeValue(cache->Value(handle));
     if (handle != nullptr) {
-      cache_->Release(handle);
+      cache->Release(handle);
     }
     return r;
   }
 
+  void Insert(shared_ptr<Cache> cache, int key, int value, int charge = 1) {
+    cache->Release(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                 &CacheTest::Deleter));
+  }
+
+  void Erase(shared_ptr<Cache> cache, int key) {
+    cache->Erase(EncodeKey(key));
+  }
+
+
+  int Lookup(int key) {
+    return Lookup(cache_, key);
+  }
+
   void Insert(int key, int value, int charge = 1) {
-    cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
-                                   &CacheTest::Deleter));
+    Insert(cache_, key, value, charge);
   }
 
   void Erase(int key) {
-    cache_->Erase(EncodeKey(key));
+    Erase(cache_, key);
+  }
+
+  int Lookup2(int key) {
+    return Lookup(cache2_, key);
+  }
+
+  void Insert2(int key, int value, int charge = 1) {
+    Insert(cache2_, key, value, charge);
+  }
+
+  void Erase2(int key) {
+    Erase(cache2_, key);
   }
 };
 CacheTest* CacheTest::current_;
@@ -147,6 +183,124 @@ TEST(CacheTest, EvictionPolicy) {
   ASSERT_EQ(-1, Lookup(200));
 }
 
+TEST(CacheTest, EvictionPolicyRef) {
+  Insert(100, 101);
+  Insert(101, 102);
+  Insert(102, 103);
+  Insert(103, 104);
+  Insert(200, 101);
+  Insert(201, 102);
+  Insert(202, 103);
+  Insert(203, 104);
+  Cache::Handle* h201 = cache_->Lookup(EncodeKey(200));
+  Cache::Handle* h202 = cache_->Lookup(EncodeKey(201));
+  Cache::Handle* h203 = cache_->Lookup(EncodeKey(202));
+  Cache::Handle* h204 = cache_->Lookup(EncodeKey(203));
+  Insert(300, 101);
+  Insert(301, 102);
+  Insert(302, 103);
+  Insert(303, 104);
+
+  // Insert many more entries than the cache capacity
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+  }
+
+  // Check whether the entries inserted at the beginning are evicted.
+  // Those that do not have an extra ref are evicted, and those that
+  // do are not.
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(-1, Lookup(101));
+  ASSERT_EQ(-1, Lookup(102));
+  ASSERT_EQ(-1, Lookup(103));
+
+  ASSERT_EQ(-1, Lookup(300));
+  ASSERT_EQ(-1, Lookup(301));
+  ASSERT_EQ(-1, Lookup(302));
+  ASSERT_EQ(-1, Lookup(303));
+
+  ASSERT_EQ(101, Lookup(200));
+  ASSERT_EQ(102, Lookup(201));
+  ASSERT_EQ(103, Lookup(202));
+  ASSERT_EQ(104, Lookup(203));
+
+  // Clean up all the handles
+  cache_->Release(h201);
+  cache_->Release(h202);
+  cache_->Release(h203);
+  cache_->Release(h204);
+}
+
+TEST(CacheTest, EvictionPolicyRef2) {
+  std::vector<Cache::Handle*> handles;
+
+  Insert(100, 101);
+  // Insert many more entries than the cache capacity
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+    if (i < kCacheSize) {
+      handles.push_back(cache_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Make sure referenced keys can also be evicted when there are
+  // not enough non-referenced keys.
+  for (int i = 0; i < 5; i++) {
+    ASSERT_EQ(-1, Lookup(1000 + i));
+  }
+
+  for (int i = kCacheSize; i < kCacheSize + 100; i++) {
+    ASSERT_EQ(2000 + i, Lookup(1000 + i));
+  }
+  ASSERT_EQ(-1, Lookup(100));
+
+  // Clean up all the handles
+  while (handles.size() > 0) {
+    cache_->Release(handles.back());
+    handles.pop_back();
+  }
+}
+
+TEST(CacheTest, EvictionPolicyRefLargeScanLimit) {
+  std::vector<Cache::Handle*> handles2;
+
+  // cache2_ has a RemoveScanCountLimit higher than its cache size,
+  // which triggers a boundary condition.
+
+  // Populate the cache with 10 more keys than its size.
+  // Reference all keys except one close to the end.
+  for (int i = 0; i < kCacheSize2 + 10; i++) {
+    Insert2(1000 + i, 2000 + i);
+    if (i != kCacheSize2) {
+      handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Make sure referenced keys can also be evicted when there are
+  // not enough non-referenced keys.
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ(-1, Lookup2(1000 + i));
+  }
+  // The non-referenced value is evicted even though it was
+  // accessed recently.
+  ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2));
+  // Other recently accessed values are not evicted since they
+  // are referenced.
+  for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) {
+    if (i != kCacheSize2) {
+      ASSERT_EQ(2000 + i, Lookup2(1000 + i));
+    }
+  }
+
+  // Clean up all the handles
+  while (handles2.size() > 0) {
+    cache2_->Release(handles2.back());
+    handles2.pop_back();
+  }
+}
+
+
+
 TEST(CacheTest, HeavyEntries) {
   // Add a bunch of light and heavy entries and then count the combined
   // size of items still in the cache, which must be approximately the
diff --git a/util/options.cc b/util/options.cc
index fe7a3689d..c1f0d8b06 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -65,6 +65,7 @@ Options::Options()
       max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
       no_block_cache(false),
       table_cache_numshardbits(4),
+      table_cache_remove_scan_count_limit(16),
       arena_block_size(0),
       disable_auto_compactions(false),
       WAL_ttl_seconds(0),
@@ -197,6 +198,8 @@ Options::Dump(Logger* log) const
         no_block_cache);
     Log(log," Options.table_cache_numshardbits: %d",
         table_cache_numshardbits);
+    Log(log," Options.table_cache_remove_scan_count_limit: %d",
+        table_cache_remove_scan_count_limit);
     Log(log," Options.arena_block_size: %ld",
         arena_block_size);
     Log(log," Options.delete_obsolete_files_period_micros: %ld",
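
Example usage of the API added by this patch. This is a minimal sketch only:
the helper name and the capacity, shard-bit, and scan-limit values below are
illustrative choices, not defaults taken from the patch, and it assumes the
usual Options::block_cache field for plugging in the block cache.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/options.h"

// Illustrative only: build Options that use the two-pass eviction policy.
rocksdb::Options MakeOptionsWithScanLimitedCache() {
  // A 64 MB block cache split into 2^4 = 16 shards. On insert into a full
  // shard, up to 32 of its least-recently-used entries are scanned for
  // unreferenced ones before eviction falls back to strict LRU order.
  std::shared_ptr<rocksdb::Cache> block_cache =
      rocksdb::NewLRUCache(64 << 20 /* capacity */,
                           4 /* numShardBits */,
                           32 /* removeScanCountLimit */);

  rocksdb::Options options;
  options.block_cache = block_cache;
  // Apply the same two-pass policy to the table cache. A value of 0 would
  // keep the old behavior: strict LRU inside each shard.
  options.table_cache_remove_scan_count_limit = 16;
  return options;
}

Passing removeScanCountLimit = 0 (the default) keeps the pre-patch behavior,
so existing callers of the one- and two-argument NewLRUCache overloads are
unaffected.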