rocksdb/utilities/persistent_cache/hash_table.h

//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once

#ifndef ROCKSDB_LITE

#include <assert.h>
#include <sys/mman.h>
#include <list>
#include <vector>

#include "include/rocksdb/env.h"
#include "port/port_posix.h"
#include "util/mutexlock.h"

namespace rocksdb {

// HashTable<T, Hash, Equal>
//
// Traditional implementations of hash tables with synchronization built on
// top don't perform very well in multi-core scenarios. This is an
// implementation designed for multi-core scenarios with high lock contention.
//
//                         |<-------- alpha ------------->|
//               Buckets   Collision list
//          ---- +----+    +---+---+--- ...... ---+---+---+
//         /     |    |--->|   |   |              |   |   |
//        /      +----+    +---+---+--- ...... ---+---+---+
//       /       |    |
// Locks/        +----+
// +--+/         .    .
// |  |          .    .
// +--+          .    .
// |  |          .    .
// +--+          .    .
// |  |          .    .
// +--+          .    .
//     \         +----+
//      \        |    |
//       \       +----+
//        \      |    |
//         \---- +----+
//
// The lock contention is spread over an array of locks. This helps improve
// concurrent access. The spine is designed for a certain capacity and load
// factor. When the capacity planning is done correctly we can expect
// O(load_factor = 1) insert, access and remove time.
//
// Micro benchmark on debug build gives about .5 Million/sec rate of insert,
// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the
// blocks were of 4K, the hash table can support  a virtual throughput of
// 6 GB/s.
//
// T      Object type (contains both key and value)
// Hash   Function that returns a hash from type T
// Equal  Returns if two objects are equal
//        (We need explicit equal for pointer type)
//
template <class T, class Hash, class Equal>
class HashTable {
 public:
  // Construct a table with (capacity / load_factor) buckets whose access is
  // striped over nlocks reader-writer locks (bucket i uses lock i % nlocks).
  //
  // capacity     Number of records the table is planned for; must be non-zero
  // load_factor  Planned number of records per bucket; must be non-zero
  // nlocks       Number of locks; must be non-zero
  //
  // The pre-conditions are only enforced by assert(), i.e. in debug builds.
  explicit HashTable(const size_t capacity = 1024 * 1024,
                     const float load_factor = 2.0, const uint32_t nlocks = 256)
      // The quotient is a float; truncate it to a bucket count explicitly
      // rather than through an implicit float -> integer narrowing.
      : nbuckets_(
            static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
        nlocks_(nlocks) {
    // pre-conditions
    assert(capacity);
    assert(load_factor);
    assert(nbuckets_);
    assert(nlocks_);

    // Allocate the spine and ask the OS to keep it resident. The result of
    // mlock() is not checked, so a failure to lock is tolerated silently.
    buckets_.reset(new Bucket[nbuckets_]);
    mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));

    // initialize locks (same treatment as the spine)
    locks_.reset(new port::RWMutex[nlocks_]);
    mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));

    // post-conditions
    assert(buckets_);
    assert(locks_);
  }

  // Every bucket is expected to be empty at destruction time. This is only
  // verified in debug builds (see AssertEmptyBuckets()).
  virtual ~HashTable() { AssertEmptyBuckets(); }

  //
  // Insert given record to hash table
  //
  // Takes the write lock of the record's bucket for the duration of the call.
  // Returns true if the record was added, false if a record that compares
  // equal (per Equal) is already present in the bucket.
  //
  bool Insert(const T& t) {
    const uint32_t bucket_idx = BucketIndex(t);

    WriteLock _(&locks_[LockIndex(bucket_idx)]);
    return Insert(&buckets_[bucket_idx], t);
  }

  // Lookup hash table
  //
  // The read lock of the record's bucket is acquired by this call.
  //
  // On a hit (return value true) a copy of the stored record is written to
  // *ret (if ret is non-null) and the read lock is handed over to the caller:
  // *ret_lock points at the lock, which is still held, and the caller must
  // call ReadUnlock() on it once done operating on the data.
  //
  // If ret_lock is null there is nobody to hand the lock to, so it is released
  // before returning; the copy in *ret was still made under the lock.
  //
  // On a miss (return value false) the lock is released before returning and
  // neither *ret nor *ret_lock is written.
  bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
    const uint32_t bucket_idx = BucketIndex(t);

    port::RWMutex& lock = locks_[LockIndex(bucket_idx)];
    lock.ReadLock();

    if (!Find(&buckets_[bucket_idx], t, ret)) {
      lock.ReadUnlock();
      return false;
    }

    if (ret_lock) {
      // Ownership of the read lock moves to the caller.
      *ret_lock = &lock;
    } else {
      lock.ReadUnlock();
    }
    return true;
  }

  //
  // Erase a given key from the hash table
  //
  // Takes the write lock of the record's bucket for the duration of the call.
  // Returns true if a matching record was removed; in that case a copy of it
  // is written to *ret when ret is non-null. Returns false if no record
  // compares equal to t.
  //
  bool Erase(const T& t, T* ret) {
    const uint32_t bucket_idx = BucketIndex(t);

    WriteLock _(&locks_[LockIndex(bucket_idx)]);
    return Erase(&buckets_[bucket_idx], t, ret);
  }

  // Fetch the mutex associated with a key
  // This call is used to hold the lock for a given data for extended period of
  // time. The returned lock is not acquired by this call.
  port::RWMutex* GetMutex(const T& t) {
    return &locks_[LockIndex(BucketIndex(t))];
  }

  // Remove every record from the table. Buckets are processed one at a time
  // with the bucket's write lock held; fn is invoked once per record (it
  // receives the record by value) before the bucket is emptied.
  void Clear(void (*fn)(T)) {
    for (uint32_t i = 0; i < nbuckets_; ++i) {
      WriteLock _(&locks_[LockIndex(i)]);
      for (auto& t : buckets_[i].list_) {
        (*fn)(t);
      }
      buckets_[i].list_.clear();
    }
  }

 protected:
  // Models bucket of keys that hash to the same bucket number
  struct Bucket {
    std::list<T> list_;
  };

  // Substitute for std::find with custom comparator operator
  // Returns an iterator to the first element that Equal reports as equal to
  // t, or list->end() if there is none.
  typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
    for (auto it = list->begin(); it != list->end(); ++it) {
      if (Equal()(*it, t)) {
        return it;
      }
    }
    return list->end();
  }

  // Append t to the bucket unless an equal record is already there.
  // No locking is done here; the public Insert() holds the write lock.
  bool Insert(Bucket* bucket, const T& t) {
    // Check if the key already exists
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      return false;
    }

    // insert to bucket
    bucket->list_.push_back(t);
    return true;
  }

  // Look t up in the bucket; on a hit copy the stored record to *ret (if ret
  // is non-null) and return true. No locking is done here.
  bool Find(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }
      return true;
    }
    return false;
  }

  // Remove the record equal to t from the bucket; on a hit copy it to *ret
  // (if ret is non-null) before erasing and return true. No locking is done
  // here.
  bool Erase(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }

      bucket->list_.erase(it);
      return true;
    }
    return false;
  }

  // assert that all buckets are empty
  // (compiled to a no-op when NDEBUG is defined)
  void AssertEmptyBuckets() {
#ifndef NDEBUG
    for (uint32_t i = 0; i < nbuckets_; ++i) {
      WriteLock _(&locks_[LockIndex(i)]);
      assert(buckets_[i].list_.empty());
    }
#endif
  }

  const uint32_t nbuckets_;                 // No. of buckets in the spine
  std::unique_ptr<Bucket[]> buckets_;       // Spine of the hash buckets
  const uint32_t nlocks_;                   // No. of locks
  std::unique_ptr<port::RWMutex[]> locks_;  // Granular locks

 private:
  // Index of the bucket that record t hashes to.
  uint32_t BucketIndex(const T& t) const {
    const uint64_t h = Hash()(t);
    return static_cast<uint32_t>(h % nbuckets_);
  }

  // Index of the lock that guards the bucket at bucket_idx.
  uint32_t LockIndex(const uint32_t bucket_idx) const {
    return bucket_idx % nlocks_;
  }
};

}  // namespace rocksdb

#endif
Persistent Read Cache (Part 2) Data structure for building persistent read cache index Summary: We expect the persistent read cache to perform at speeds upto 8 GB/s. In order to accomplish that, we need build a index mechanism which operate in the order of multiple millions per sec rate. This patch provide the basic data structure to accomplish that: (1) Hash table implementation with lock contention spread It is based on the StripedHashSet<T> implementation in The Art of multiprocessor programming by Maurice Henry & Nir Shavit (2) LRU implementation Place holder algorithm for further optimizing (3) Evictable Hash Table implementation Building block for building index data structure that evicts data like files etc TODO: (1) Figure if the sharded hash table and LRU can be used instead (2) Figure if we need to support configurable eviction algorithm for EvictableHashTable Test Plan: Run unit tests Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D55785 9 years ago			`// Copyright (c) 2013, Facebook, Inc. All rights reserved.`
			`// This source code is licensed under the BSD-style license found in the`
			`// LICENSE file in the root directory of this source tree. An additional grant`
			`// of patent rights can be found in the PATENTS file in the same directory.`
			`//`
			`#pragma once`

Disable lite build/testing for persistent read cache Summary: Persistent read cache isn't very applicable for lite builds. Wrapping the code with #ifndef ROCKSDB_LITE .. #endif Test Plan: Run unit, lite, lite_test Reviewers: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D58563 9 years ago			`#ifndef ROCKSDB_LITE`

Persistent Read Cache (Part 2) Data structure for building persistent read cache index Summary: We expect the persistent read cache to perform at speeds upto 8 GB/s. In order to accomplish that, we need build a index mechanism which operate in the order of multiple millions per sec rate. This patch provide the basic data structure to accomplish that: (1) Hash table implementation with lock contention spread It is based on the StripedHashSet<T> implementation in The Art of multiprocessor programming by Maurice Henry & Nir Shavit (2) LRU implementation Place holder algorithm for further optimizing (3) Evictable Hash Table implementation Building block for building index data structure that evicts data like files etc TODO: (1) Figure if the sharded hash table and LRU can be used instead (2) Figure if we need to support configurable eviction algorithm for EvictableHashTable Test Plan: Run unit tests Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D55785 9 years ago			`#include <assert.h>`
			`#include <sys/mman.h>`
			`#include <list>`
			`#include <vector>`

			`#include "include/rocksdb/env.h"`
			`#include "port/port_posix.h"`
			`#include "util/mutexlock.h"`

			`namespace rocksdb {`

			`// HashTable<T, Hash, Equal>`
			`//`
			`// Traditional implementation of hash table with syncronization built on top`
			`// don't perform very well in multi-core scenarios. This is an implementation`
			`// designed for multi-core scenarios with high lock contention.`
			`//`
			`// \|<-------- alpha ------------->\|`
			`// Buckets Collision list`
			`// ---- +----+ +---+---+--- ...... ---+---+---+`
			`// / \| \|--->\| \| \| \| \| \|`
			`// / +----+ +---+---+--- ...... ---+---+---+`
			`// / \| \|`
			`// Locks/ +----+`
			`// +--+/ . .`
			`// \| \| . .`
			`// +--+ . .`
			`// \| \| . .`
			`// +--+ . .`
			`// \| \| . .`
			`// +--+ . .`
			`// \ +----+`
			`// \ \| \|`
			`// \ +----+`
			`// \ \| \|`
			`// \---- +----+`
			`//`
			`// The lock contention is spread over an array of locks. This helps improve`
			`// concurrent access. The spine is designed for a certain capacity and load`
			`// factor. When the capacity planning is done correctly we can expect`
			`// O(load_factor = 1) insert, access and remove time.`
			`//`
			`// Micro benchmark on debug build gives about .5 Million/sec rate of insert,`
			`// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the`
			`// blocks were of 4K, the hash table can support a virtual throughput of`
			`// 6 GB/s.`
			`//`
			`// T Object type (contains both key and value)`
			`// Hash Function that returns an hash from type T`
			`// Equal Returns if two objects are equal`
			`// (We need explicit equal for pointer type)`
			`//`
			`template <class T, class Hash, class Equal>`
			`class HashTable {`
			`public:`
			`explicit HashTable(const size_t capacity = 1024 * 1024,`
			`const float load_factor = 2.0, const uint32_t nlocks = 256)`
			`: nbuckets_(load_factor ? capacity / load_factor : 0), nlocks_(nlocks) {`
			`// pre-conditions`
			`assert(capacity);`
			`assert(load_factor);`
			`assert(nbuckets_);`
			`assert(nlocks_);`

			`buckets_.reset(new Bucket[nbuckets_]);`
			`mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));`

			`// initialize locks`
			`locks_.reset(new port::RWMutex[nlocks_]);`
			`mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));`

			`// post-conditions`
			`assert(buckets_);`
			`assert(locks_);`
			`}`

			`virtual ~HashTable() { AssertEmptyBuckets(); }`

			`//`
			`// Insert given record to hash table`
			`//`
			`bool Insert(const T& t) {`
			`const uint64_t h = Hash()(t);`
			`const uint32_t bucket_idx = h % nbuckets_;`
			`const uint32_t lock_idx = bucket_idx % nlocks_;`

			`WriteLock _(&locks_[lock_idx]);`
			`auto& bucket = buckets_[bucket_idx];`
			`return Insert(&bucket, t);`
			`}`

			`// Lookup hash table`
			`//`
			`// Please note that read lock should be held by the caller. This is because`
			`// the caller owns the data, and should hold the read lock as long as he`
			`// operates on the data.`
			`bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {`
			`const uint64_t h = Hash()(t);`
			`const uint32_t bucket_idx = h % nbuckets_;`
			`const uint32_t lock_idx = bucket_idx % nlocks_;`

			`port::RWMutex& lock = locks_[lock_idx];`
			`lock.ReadLock();`

			`auto& bucket = buckets_[bucket_idx];`
			`if (Find(&bucket, t, ret)) {`
			`*ret_lock = &lock;`
			`return true;`
			`}`

			`lock.ReadUnlock();`
			`return false;`
			`}`

			`//`
			`// Erase a given key from the hash table`
			`//`
			`bool Erase(const T& t, T* ret) {`
			`const uint64_t h = Hash()(t);`
			`const uint32_t bucket_idx = h % nbuckets_;`
			`const uint32_t lock_idx = bucket_idx % nlocks_;`

			`WriteLock _(&locks_[lock_idx]);`

			`auto& bucket = buckets_[bucket_idx];`
			`return Erase(&bucket, t, ret);`
			`}`

			`// Fetch the mutex associated with a key`
			`// This call is used to hold the lock for a given data for extended period of`
			`// time.`
			`port::RWMutex* GetMutex(const T& t) {`
			`const uint64_t h = Hash()(t);`
			`const uint32_t bucket_idx = h % nbuckets_;`
			`const uint32_t lock_idx = bucket_idx % nlocks_;`

			`return &locks_[lock_idx];`
			`}`

			`void Clear(void (*fn)(T)) {`
			`for (uint32_t i = 0; i < nbuckets_; ++i) {`
			`const uint32_t lock_idx = i % nlocks_;`
			`WriteLock _(&locks_[lock_idx]);`
			`for (auto& t : buckets_[i].list_) {`
			`(*fn)(t);`
			`}`
			`buckets_[i].list_.clear();`
			`}`
			`}`

			`protected:`
			`// Models bucket of keys that hash to the same bucket number`
			`struct Bucket {`
			`std::list<T> list_;`
			`};`

			`// Substitute for std::find with custom comparator operator`
			`typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {`
			`for (auto it = list->begin(); it != list->end(); ++it) {`
			`if (Equal()(*it, t)) {`
			`return it;`
			`}`
			`}`
			`return list->end();`
			`}`

			`bool Insert(Bucket* bucket, const T& t) {`
			`// Check if the key already exists`
			`auto it = Find(&bucket->list_, t);`
			`if (it != bucket->list_.end()) {`
			`return false;`
			`}`

			`// insert to bucket`
			`bucket->list_.push_back(t);`
			`return true;`
			`}`

			`bool Find(Bucket* bucket, const T& t, T* ret) {`
			`auto it = Find(&bucket->list_, t);`
			`if (it != bucket->list_.end()) {`
			`if (ret) {`
			`*ret = *it;`
			`}`
			`return true;`
			`}`
			`return false;`
			`}`

			`bool Erase(Bucket* bucket, const T& t, T* ret) {`
			`auto it = Find(&bucket->list_, t);`
			`if (it != bucket->list_.end()) {`
			`if (ret) {`
			`*ret = *it;`
			`}`

			`bucket->list_.erase(it);`
			`return true;`
			`}`
			`return false;`
			`}`

			`// assert that all buckets are empty`
			`void AssertEmptyBuckets() {`
			`#ifndef NDEBUG`
			`for (size_t i = 0; i < nbuckets_; ++i) {`
			`WriteLock _(&locks_[i % nlocks_]);`
			`assert(buckets_[i].list_.empty());`
			`}`
			`#endif`
			`}`

			`const uint32_t nbuckets_; // No. of buckets in the spine`
			`std::unique_ptr<Bucket[]> buckets_; // Spine of the hash buckets`
			`const uint32_t nlocks_; // No. of locks`
			`std::unique_ptr<port::RWMutex[]> locks_; // Granular locks`
			`};`

			`} // namespace rocksdb`
Disable lite build/testing for persistent read cache Summary: Persistent read cache isn't very applicable for lite builds. Wrapping the code with #ifndef ROCKSDB_LITE .. #endif Test Plan: Run unit, lite, lite_test Reviewers: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D58563 9 years ago
			`#endif`