Summary: Cuckoo Hashing based SST table builder.
Contains:
- Cuckoo Hashing logic and file storage logic.
- Unit tests for the logic.

Test Plan:
make cuckoo_table_builder_test
./cuckoo_table_builder_test
make check all

Reviewers: yhchiang, igor, sdong, ljin
Reviewed By: ljin
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D19545
main
parent
f6f1533c6f
commit
cf3da899b0
@ -0,0 +1,333 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE |
||||
#include "table/cuckoo_table_builder.h" |
||||
|
||||
#include <assert.h> |
||||
#include <algorithm> |
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "db/dbformat.h" |
||||
#include "rocksdb/env.h" |
||||
#include "rocksdb/table.h" |
||||
#include "table/block_builder.h" |
||||
#include "table/format.h" |
||||
#include "table/meta_blocks.h" |
||||
#include "util/autovector.h" |
||||
#include "util/random.h" |
||||
|
||||
namespace rocksdb { |
||||
// Keys under which the builder records cuckoo-specific values in the
// user-collected table properties.
const std::string CuckooTablePropertyNames::kEmptyBucket(
    "rocksdb.cuckoo.bucket.empty.bucket");
const std::string CuckooTablePropertyNames::kNumHashTable(
    "rocksdb.cuckoo.hash.num");
const std::string CuckooTablePropertyNames::kMaxNumBuckets(
    "rocksdb.cuckoo.bucket.maxnum");

// Magic number written into the footer of cuckoo tables.
// Obtained by running echo rocksdb.table.cuckoo | sha1sum
extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ULL;
||||
|
||||
// Sets up an empty in-memory cuckoo hash table that Finish() later writes to
// `file`.
//
// file:               destination file; only the pointer is stored here.
// fixed_key_length:   length of every key passed to Add().
// fixed_value_length: length of every value passed to Add().
// hash_table_ratio:   used by FileSize() to scale the estimated size.
// file_size:          byte budget; the number of buckets is
//                     file_size / (fixed_key_length + fixed_value_length).
// max_num_hash_table: upper bound on the number of hash functions. The
//                     builder starts with min(4, max_num_hash_table).
// max_search_depth:   depth limit for the displacement search performed by
//                     MakeSpaceForKey().
// GetSliceHashPtr:    hash function called as (user_key, hash index,
//                     number of buckets); its result is used as a bucket
//                     index.
CuckooTableBuilder::CuckooTableBuilder(
    WritableFile* file, unsigned int fixed_key_length,
    unsigned int fixed_value_length, double hash_table_ratio,
    unsigned int file_size, unsigned int max_num_hash_table,
    unsigned int max_search_depth,
    unsigned int (*GetSliceHashPtr)(const Slice&, unsigned int,
                                    unsigned int))
    : num_hash_table_(std::min(4u, max_num_hash_table)),
      file_(file),
      key_length_(fixed_key_length),
      value_length_(fixed_value_length),
      bucket_size_(fixed_key_length + fixed_value_length),
      hash_table_ratio_(hash_table_ratio),
      // Guard against zero-sized buckets: dividing by zero is undefined.
      // With zero buckets the very first Add() reports a full table.
      max_num_buckets_(bucket_size_ == 0 ? 0 : file_size / bucket_size_),
      max_num_hash_table_(max_num_hash_table),
      max_search_depth_(max_search_depth),
      buckets_(max_num_buckets_),
      GetSliceHash(GetSliceHashPtr) {
  // The bucket_size is currently not optimized for last level.
  // In last level, the bucket will not contain full key.
  // TODO(rbs): Find how we can determine if last level or not
  // before we start adding entries into the table.
  properties_.num_entries = 0;
  // Data is in a huge block.
  properties_.num_data_blocks = 1;
  properties_.index_size = 0;
  properties_.filter_size = 0;
}
||||
|
||||
// Nothing to release explicitly: the builder does not delete `file_`, and all
// other members clean up after themselves.
CuckooTableBuilder::~CuckooTableBuilder() = default;
||||
|
||||
void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { |
||||
if (NumEntries() == max_num_buckets_) { |
||||
status_ = Status::Corruption("Hash Table is full."); |
||||
return; |
||||
} |
||||
unsigned int bucket_id; |
||||
bool bucket_found = false; |
||||
autovector<unsigned int> hash_vals; |
||||
ParsedInternalKey ikey; |
||||
if (!ParseInternalKey(key, &ikey)) { |
||||
status_ = Status::Corruption("Unable to parse key into inernal key."); |
||||
return; |
||||
} |
||||
Slice user_key = ikey.user_key; |
||||
for (unsigned int hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { |
||||
unsigned int hash_val = GetSliceHash(user_key, hash_cnt, max_num_buckets_); |
||||
if (buckets_[hash_val].is_empty) { |
||||
bucket_id = hash_val; |
||||
bucket_found = true; |
||||
break; |
||||
} else { |
||||
if (user_key.compare(ExtractUserKey(buckets_[hash_val].key)) == 0) { |
||||
status_ = Status::Corruption("Same key is being inserted again."); |
||||
return; |
||||
} |
||||
hash_vals.push_back(hash_val); |
||||
} |
||||
} |
||||
while (!bucket_found && !MakeSpaceForKey(key, &bucket_id, hash_vals)) { |
||||
// Rehash by increashing number of hash tables.
|
||||
if (num_hash_table_ >= max_num_hash_table_) { |
||||
status_ = Status::Corruption("Too many collissions. Unable to hash."); |
||||
return; |
||||
} |
||||
// We don't really need to rehash the entire table because old hashes are
|
||||
// still valid and we only increased the number of hash functions.
|
||||
unsigned int old_num_hash = num_hash_table_; |
||||
num_hash_table_ = std::min(num_hash_table_ + 1, max_num_hash_table_); |
||||
for (unsigned int i = old_num_hash; i < num_hash_table_; i++) { |
||||
unsigned int hash_val = GetSliceHash(user_key, i, max_num_buckets_); |
||||
if (buckets_[hash_val].is_empty) { |
||||
bucket_found = true; |
||||
bucket_id = hash_val; |
||||
break; |
||||
} else { |
||||
hash_vals.push_back(hash_val); |
||||
} |
||||
} |
||||
} |
||||
buckets_[bucket_id].key = key; |
||||
buckets_[bucket_id].value = value; |
||||
buckets_[bucket_id].is_empty = false; |
||||
|
||||
if (ikey.sequence != 0) { |
||||
// This is not a last level file.
|
||||
is_last_level_file_ = false; |
||||
} |
||||
properties_.num_entries++; |
||||
|
||||
// We assume that the keys are inserted in sorted order. To identify an
|
||||
// unused key, which will be used in filling empty buckets in the table,
|
||||
// we try to find gaps between successive keys inserted. This is done by
|
||||
// maintaining the previous key and comparing it with next key.
|
||||
if (unused_user_key_.empty()) { |
||||
if (prev_key_.empty()) { |
||||
prev_key_ = user_key.ToString(); |
||||
return; |
||||
} |
||||
std::string new_user_key = prev_key_; |
||||
new_user_key.back()++; |
||||
// We ignore carry-overs and check that it is larger than previous key.
|
||||
if ((new_user_key > prev_key_) && |
||||
(new_user_key < user_key.ToString())) { |
||||
unused_user_key_ = new_user_key; |
||||
} else { |
||||
prev_key_ = user_key.ToString(); |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Returns the most recent error recorded by Add(), or OK if there was none.
Status CuckooTableBuilder::status() const {
  return status_;
}
||||
|
||||
// Writes the whole table to file_: every bucket in index order, then the
// properties block, the meta index block and the footer.
//
// Empty buckets are filled with an "unused" key (padded with 'a' up to the
// bucket size). If every added key had sequence number 0, only user keys are
// written; otherwise full internal keys are written.
//
// Returns Corruption if no unused key could be determined (including when no
// key was ever added), or the first error reported by file_->Append().
Status CuckooTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;

  // Add() may not have found a gap between keys; fall back to the key just
  // after the last one seen.
  if (unused_user_key_.empty()) {
    if (prev_key_.empty()) {
      return Status::Corruption("Unable to find unused key");
    }
    std::string after_last = prev_key_;
    after_last.back()++;
    // We ignore carry-overs and check that it is larger than previous key.
    if (!(after_last > prev_key_)) {
      return Status::Corruption("Unable to find unused key");
    }
    unused_user_key_ = after_last;
  }

  // Build the filler written into every empty bucket.
  std::string filler;
  if (is_last_level_file_) {
    filler = unused_user_key_;
  } else {
    ParsedInternalKey filler_key(unused_user_key_, 0, kTypeValue);
    AppendInternalKey(&filler, filler_key);
  }
  properties_.fixed_key_len = filler.size();
  unsigned int bucket_size = filler.size() + value_length_;
  // Pad the key with 'a' so that the filler spans a whole bucket.
  filler.resize(bucket_size, 'a');

  // Write the table.
  for (auto& bucket : buckets_) {
    Status s;
    if (bucket.is_empty) {
      s = file_->Append(Slice(filler));
    } else {
      // Last level files store the user key only.
      const Slice stored_key =
          is_last_level_file_ ? ExtractUserKey(bucket.key) : bucket.key;
      s = file_->Append(stored_key);
      if (s.ok()) {
        s = file_->Append(bucket.value);
      }
    }
    if (!s.ok()) {
      return s;
    }
  }

  unsigned int offset = buckets_.size() * bucket_size;
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kEmptyBucket] = filler;
  properties_.user_collected_properties[
      CuckooTablePropertyNames::kNumHashTable] =
      std::to_string(num_hash_table_);
  PutVarint32(&properties_.user_collected_properties[
      CuckooTablePropertyNames::kMaxNumBuckets], max_num_buckets_);

  // Write meta blocks.
  MetaIndexBuilder meta_index_builder;
  PropertyBlockBuilder property_block_builder;

  property_block_builder.AddTableProperty(properties_);
  property_block_builder.Add(properties_.user_collected_properties);
  Slice property_block = property_block_builder.Finish();

  BlockHandle property_block_handle;
  property_block_handle.set_offset(offset);
  property_block_handle.set_size(property_block.size());
  Status s = file_->Append(property_block);
  if (!s.ok()) {
    return s;
  }
  offset += property_block.size();

  meta_index_builder.Add(kPropertiesBlock, property_block_handle);
  Slice meta_index_block = meta_index_builder.Finish();

  BlockHandle meta_index_block_handle;
  meta_index_block_handle.set_offset(offset);
  meta_index_block_handle.set_size(meta_index_block.size());
  s = file_->Append(meta_index_block);
  if (!s.ok()) {
    return s;
  }

  // The footer points at the meta index block; there is no index block.
  Footer footer(kCuckooTableMagicNumber);
  footer.set_metaindex_handle(meta_index_block_handle);
  footer.set_index_handle(BlockHandle::NullBlockHandle());
  std::string footer_encoding;
  footer.EncodeTo(&footer_encoding);
  return file_->Append(footer_encoding);
}
||||
|
||||
// Marks the builder as closed without writing anything to the file.
// Must not be called after Finish() or a previous Abandon().
void CuckooTableBuilder::Abandon() {
  assert(!closed_);
  closed_ = true;
}
||||
|
||||
uint64_t CuckooTableBuilder::NumEntries() const { |
||||
return properties_.num_entries; |
||||
} |
||||
|
||||
uint64_t CuckooTableBuilder::FileSize() const { |
||||
if (closed_) { |
||||
return file_->GetFileSize(); |
||||
} else { |
||||
// This is not the actual size of the file as we need to account for
|
||||
// hash table ratio. This returns the size of filled buckets in the table
|
||||
// scaled up by a factor of 1/hash table ratio.
|
||||
return (properties_.num_entries * bucket_size_) / hash_table_ratio_; |
||||
} |
||||
} |
||||
|
||||
bool CuckooTableBuilder::MakeSpaceForKey(const Slice& key, |
||||
unsigned int *bucket_id, autovector<unsigned int> hash_vals) { |
||||
struct CuckooNode { |
||||
unsigned int bucket_id; |
||||
unsigned int depth; |
||||
int parent_pos; |
||||
CuckooNode(unsigned int bucket_id, unsigned int depth, int parent_pos) |
||||
: bucket_id(bucket_id), depth(depth), parent_pos(parent_pos) {} |
||||
}; |
||||
// This is BFS search tree that is stored simply as a vector.
|
||||
// Each node stores the index of parent node in the vector.
|
||||
std::vector<CuckooNode> tree; |
||||
// This is a very bad way to keep track of visited nodes.
|
||||
// TODO(rbs): Change this by adding a 'GetKeyPathId' field to the bucket
|
||||
// and use it to track visited nodes.
|
||||
std::vector<bool> buckets_visited(max_num_buckets_, false); |
||||
for (unsigned int hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { |
||||
unsigned int bucket_id = hash_vals[hash_cnt]; |
||||
buckets_visited[bucket_id] = true; |
||||
tree.push_back(CuckooNode(bucket_id, 0, -1)); |
||||
} |
||||
bool null_found = false; |
||||
unsigned int curr_pos = 0; |
||||
while (!null_found && curr_pos < tree.size()) { |
||||
CuckooNode& curr_node = tree[curr_pos]; |
||||
if (curr_node.depth >= max_search_depth_) { |
||||
break; |
||||
} |
||||
CuckooBucket& curr_bucket = buckets_[curr_node.bucket_id]; |
||||
for (unsigned int hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { |
||||
unsigned int child_bucket_id = GetSliceHash( |
||||
ExtractUserKey(curr_bucket.key), hash_cnt, max_num_buckets_); |
||||
if (child_bucket_id == curr_node.bucket_id) { |
||||
continue; |
||||
} |
||||
if (buckets_visited[child_bucket_id]) { |
||||
continue; |
||||
} |
||||
buckets_visited[child_bucket_id] = true; |
||||
tree.push_back(CuckooNode(child_bucket_id, curr_node.depth + 1, |
||||
curr_pos)); |
||||
if (buckets_[child_bucket_id].is_empty) { |
||||
null_found = true; |
||||
break; |
||||
} |
||||
} |
||||
++curr_pos; |
||||
} |
||||
|
||||
if (null_found) { |
||||
int bucket_to_replace_pos = tree.size()-1; |
||||
while (bucket_to_replace_pos >= 0) { |
||||
CuckooNode& curr_node = tree[bucket_to_replace_pos]; |
||||
if (curr_node.parent_pos != -1) { |
||||
buckets_[curr_node.bucket_id] = buckets_[curr_node.parent_pos]; |
||||
bucket_to_replace_pos = curr_node.parent_pos; |
||||
} else { |
||||
*bucket_id = curr_node.bucket_id; |
||||
return true; |
||||
} |
||||
} |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,97 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
#ifndef ROCKSDB_LITE |
||||
#include <stdint.h> |
||||
#include <string> |
||||
#include <vector> |
||||
#include "rocksdb/status.h" |
||||
#include "table/table_builder.h" |
||||
#include "rocksdb/table.h" |
||||
#include "rocksdb/table_properties.h" |
||||
#include "util/autovector.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
struct CuckooBucket { |
||||
CuckooBucket(): is_empty(true) {} |
||||
Slice key; |
||||
Slice value; |
||||
bool is_empty; |
||||
}; |
||||
|
||||
class CuckooTableBuilder: public TableBuilder { |
||||
public: |
||||
CuckooTableBuilder( |
||||
WritableFile* file, unsigned int fixed_key_length, |
||||
unsigned int fixed_value_length, double hash_table_ratio, |
||||
unsigned int file_size, unsigned int max_num_hash_table, |
||||
unsigned int max_search_depth, |
||||
unsigned int (*GetSliceHash)(const Slice&, unsigned int, |
||||
unsigned int)); |
||||
|
||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||
~CuckooTableBuilder(); |
||||
|
||||
// Add key,value to the table being constructed.
|
||||
// REQUIRES: key is after any previously added key according to comparator.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
void Add(const Slice& key, const Slice& value) override; |
||||
|
||||
// Return non-ok iff some error has been detected.
|
||||
Status status() const override; |
||||
|
||||
// Finish building the table. Stops using the file passed to the
|
||||
// constructor after this function returns.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
Status Finish() override; |
||||
|
||||
// Indicate that the contents of this builder should be abandoned. Stops
|
||||
// using the file passed to the constructor after this function returns.
|
||||
// If the caller is not going to call Finish(), it must call Abandon()
|
||||
// before destroying this builder.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
void Abandon() override; |
||||
|
||||
// Number of calls to Add() so far.
|
||||
uint64_t NumEntries() const override; |
||||
|
||||
// Size of the file generated so far. If invoked after a successful
|
||||
// Finish() call, returns the size of the final generated file.
|
||||
uint64_t FileSize() const override; |
||||
|
||||
private: |
||||
bool MakeSpaceForKey(const Slice& key, unsigned int* bucket_id, |
||||
autovector<unsigned int> hash_vals); |
||||
|
||||
unsigned int num_hash_table_; |
||||
WritableFile* file_; |
||||
const unsigned int key_length_; |
||||
const unsigned int value_length_; |
||||
const unsigned int bucket_size_; |
||||
const double hash_table_ratio_; |
||||
const unsigned int max_num_buckets_; |
||||
const unsigned int max_num_hash_table_; |
||||
const unsigned int max_search_depth_; |
||||
Status status_; |
||||
std::vector<CuckooBucket> buckets_; |
||||
bool is_last_level_file_ = true; |
||||
TableProperties properties_; |
||||
unsigned int (*GetSliceHash)(const Slice& s, unsigned int index, |
||||
unsigned int max_num_buckets); |
||||
std::string unused_user_key_ = ""; |
||||
std::string prev_key_; |
||||
|
||||
bool closed_ = false; // Either Finish() or Abandon() has been called.
|
||||
|
||||
// No copying allowed
|
||||
CuckooTableBuilder(const CuckooTableBuilder&) = delete; |
||||
void operator=(const CuckooTableBuilder&) = delete; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
@ -0,0 +1,468 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "table/cuckoo_table_builder.h"
#include "table/meta_blocks.h"
#include "util/random.h"
#include "util/testharness.h"
#include "util/testutil.h"
||||
|
||||
namespace rocksdb { |
||||
|
||||
extern const uint64_t kCuckooTableMagicNumber; |
||||
|
||||
namespace { |
||||
// Test-only hash table: maps a user key to its candidate bucket ids, indexed
// by hash function number. Consulted by the GetSliceHash() test double.
std::unordered_map<std::string, std::vector<unsigned int>> hash_map;

// Registers the consecutive buckets bucket_id, bucket_id + 1, ...,
// bucket_id + num_hash_fun - 1 as the candidates of key `s`, replacing any
// previous registration for the same key.
void AddHashLookups(const std::string& s, unsigned int bucket_id,
                    unsigned int num_hash_fun) {
  std::vector<unsigned int> candidates;
  for (unsigned int offset = 0; offset != num_hash_fun; ++offset) {
    candidates.push_back(bucket_id + offset);
  }
  hash_map[s] = candidates;
}
||||
|
||||
unsigned int GetSliceHash(const Slice& s, unsigned int index, |
||||
unsigned int max_num_buckets) { |
||||
return hash_map[s.ToString()][index]; |
||||
} |
||||
} // namespace
|
||||
|
||||
class CuckooBuilderTest { |
||||
public: |
||||
CuckooBuilderTest() { |
||||
env_ = Env::Default(); |
||||
} |
||||
|
||||
void CheckFileContents(const std::string& expected_data) { |
||||
// Read file
|
||||
unique_ptr<RandomAccessFile> read_file; |
||||
ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); |
||||
uint64_t read_file_size; |
||||
ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); |
||||
|
||||
// Assert Table Properties.
|
||||
TableProperties* props = nullptr; |
||||
ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size, |
||||
kCuckooTableMagicNumber, env_, nullptr, &props)); |
||||
ASSERT_EQ(props->num_entries, num_items); |
||||
ASSERT_EQ(props->fixed_key_len, key_length); |
||||
|
||||
// Check unused bucket.
|
||||
std::string unused_bucket = props->user_collected_properties[ |
||||
CuckooTablePropertyNames::kEmptyBucket]; |
||||
ASSERT_EQ(expected_unused_bucket, unused_bucket); |
||||
|
||||
unsigned int max_buckets; |
||||
Slice max_buckets_slice = Slice(props->user_collected_properties[ |
||||
CuckooTablePropertyNames::kMaxNumBuckets]); |
||||
GetVarint32(&max_buckets_slice, &max_buckets); |
||||
ASSERT_EQ(expected_max_buckets, max_buckets); |
||||
// Check contents of the bucket.
|
||||
std::string read_data; |
||||
read_data.resize(expected_data.size()); |
||||
Slice read_slice; |
||||
ASSERT_OK(read_file->Read(0, expected_data.size(), |
||||
&read_slice, &read_data[0])); |
||||
ASSERT_EQ(expected_data, read_data); |
||||
} |
||||
|
||||
Env* env_; |
||||
const EnvOptions env_options_; |
||||
std::string fname; |
||||
std::string expected_unused_bucket; |
||||
unsigned int file_size = 100000; |
||||
unsigned int num_items = 20; |
||||
unsigned int num_hash_fun = 64; |
||||
double hash_table_ratio = 0.9; |
||||
unsigned int ikey_length; |
||||
unsigned int user_key_length; |
||||
unsigned int key_length; |
||||
unsigned int value_length; |
||||
unsigned int bucket_length; |
||||
unsigned int expected_max_buckets; |
||||
}; |
||||
|
||||
|
||||
// Every key gets its own disjoint run of candidate buckets, so each key must
// end up in its first candidate. Sequence numbers are non-zero, so full
// internal keys are expected in the file.
TEST(CuckooBuilderTest, NoCollision) {
  hash_map.clear();
  num_items = 20;
  num_hash_fun = 64;
  std::vector<std::string> user_keys(num_items);
  std::vector<std::string> keys(num_items);
  std::vector<std::string> values(num_items);
  unsigned int bucket_ids = 0;
  for (unsigned int i = 0; i < num_items; i++) {
    user_keys[i] = "keys" + std::to_string(i+100);
    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
    AppendInternalKey(&keys[i], ikey);
    values[i] = "value" + std::to_string(i+100);
    AddHashLookups(user_keys[i], bucket_ids, num_hash_fun);
    bucket_ids += num_hash_fun;
  }

  ikey_length = keys[0].size();
  key_length = ikey_length;
  value_length = values[0].size();
  bucket_length = ikey_length + value_length;
  expected_max_buckets = file_size / bucket_length;
  // "keys109" with its last byte incremented: the first gap between keys.
  std::string expected_unused_user_key = "keys10:";
  ParsedInternalKey ikey(expected_unused_user_key, 0, kTypeValue);
  AppendInternalKey(&expected_unused_bucket, ikey);
  expected_unused_bucket.resize(bucket_length, 'a');
  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/BasicTest_writable_file";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Stack-allocated so that the builder is destroyed at the end of the test
  // (it was previously allocated with new and never deleted).
  CuckooTableBuilder cuckoo_builder(
      writable_file.get(), ikey_length,
      value_length, hash_table_ratio,
      file_size, num_hash_fun, 100, GetSliceHash);
  ASSERT_OK(cuckoo_builder.status());
  unsigned int key_idx = 0;
  std::string expected_file_data = "";
  for (unsigned int i = 0; i < expected_max_buckets; i++) {
    if (key_idx * num_hash_fun == i && key_idx < num_items) {
      cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
      ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1);
      ASSERT_OK(cuckoo_builder.status());
      expected_file_data.append(keys[key_idx] + values[key_idx]);
      ++key_idx;
    } else {
      expected_file_data.append(expected_unused_bucket);
    }
  }
  ASSERT_OK(cuckoo_builder.Finish());
  ASSERT_OK(writable_file->Close());
  CheckFileContents(expected_file_data);
}
||||
|
||||
// Same layout as NoCollision, but every key has sequence number 0, so the
// builder is expected to write user keys only.
TEST(CuckooBuilderTest, NoCollisionLastLevel) {
  hash_map.clear();
  std::vector<std::string> user_keys(num_items);
  std::vector<std::string> keys(num_items);
  std::vector<std::string> values(num_items);
  unsigned int bucket_ids = 0;
  for (unsigned int i = 0; i < num_items; i++) {
    user_keys[i] = "keys" + std::to_string(i+100);
    // Set zero sequence number in all keys.
    ParsedInternalKey ikey(user_keys[i], 0, kTypeValue);
    AppendInternalKey(&keys[i], ikey);
    values[i] = "value" + std::to_string(i+100);
    AddHashLookups(user_keys[i], bucket_ids, num_hash_fun);
    bucket_ids += num_hash_fun;
  }
  ikey_length = keys[0].size();
  user_key_length = user_keys[0].size();
  key_length = user_key_length;
  value_length = values[0].size();
  bucket_length = key_length + value_length;
  expected_max_buckets = file_size / bucket_length;
  expected_unused_bucket = "keys10:";
  expected_unused_bucket.resize(bucket_length, 'a');
  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/NoCollisionLastLevel_writable_file";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Stack-allocated so that the builder is destroyed at the end of the test
  // (it was previously allocated with new and never deleted).
  CuckooTableBuilder cuckoo_builder(
      writable_file.get(), key_length,
      value_length, hash_table_ratio,
      file_size, num_hash_fun, 100, GetSliceHash);
  ASSERT_OK(cuckoo_builder.status());
  unsigned int key_idx = 0;
  std::string expected_file_data = "";
  for (unsigned int i = 0; i < expected_max_buckets; i++) {
    if (key_idx * num_hash_fun == i && key_idx < num_items) {
      cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
      ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1);
      ASSERT_OK(cuckoo_builder.status());
      expected_file_data.append(user_keys[key_idx] + values[key_idx]);
      ++key_idx;
    } else {
      expected_file_data.append(expected_unused_bucket);
    }
  }
  ASSERT_OK(cuckoo_builder.Finish());
  ASSERT_OK(writable_file->Close());
  CheckFileContents(expected_file_data);
}
||||
|
||||
// All keys share the same list of candidate buckets (0 .. num_hash_fun - 1).
// With as many keys as hash functions, key i is expected in bucket i.
TEST(CuckooBuilderTest, WithCollision) {
  // Take keys with colliding hash function values.
  hash_map.clear();
  num_hash_fun = 20;
  num_items = num_hash_fun;
  std::vector<std::string> user_keys(num_items);
  std::vector<std::string> keys(num_items);
  std::vector<std::string> values(num_items);
  for (unsigned int i = 0; i < num_items; i++) {
    user_keys[i] = "keys" + std::to_string(i+100);
    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
    AppendInternalKey(&keys[i], ikey);
    values[i] = "value" + std::to_string(i+100);
    // Make all hash values collide.
    AddHashLookups(user_keys[i], 0, num_hash_fun);
  }
  ikey_length = keys[0].size();
  value_length = values[0].size();
  key_length = ikey_length;
  bucket_length = key_length + value_length;
  expected_max_buckets = file_size / bucket_length;
  std::string expected_unused_user_key = "keys10:";
  ParsedInternalKey ikey(expected_unused_user_key, 0, kTypeValue);
  AppendInternalKey(&expected_unused_bucket, ikey);
  expected_unused_bucket.resize(bucket_length, 'a');
  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/WithCollision_writable_file";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Stack-allocated so that the builder is destroyed at the end of the test
  // (it was previously allocated with new and never deleted).
  CuckooTableBuilder cuckoo_builder(
      writable_file.get(), key_length, value_length, hash_table_ratio,
      file_size, num_hash_fun, 100, GetSliceHash);
  ASSERT_OK(cuckoo_builder.status());
  unsigned int key_idx = 0;
  std::string expected_file_data = "";
  for (unsigned int i = 0; i < expected_max_buckets; i++) {
    if (key_idx == i && key_idx < num_items) {
      cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
      ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1);
      ASSERT_OK(cuckoo_builder.status());
      expected_file_data.append(keys[key_idx] + values[key_idx]);
      ++key_idx;
    } else {
      expected_file_data.append(expected_unused_bucket);
    }
  }
  ASSERT_OK(cuckoo_builder.Finish());
  ASSERT_OK(writable_file->Close());
  CheckFileContents(expected_file_data);
}
||||
|
||||
// All keys share the same num_hash_fun candidate buckets, and there is one
// key more than there are candidates: the last Add() must fail with
// Corruption.
TEST(CuckooBuilderTest, FailWithTooManyCollisions) {
  // Take keys with colliding hash function values.
  // Take more keys than the number of hash functions.
  hash_map.clear();
  num_hash_fun = 20;
  num_items = num_hash_fun + 1;
  std::vector<std::string> user_keys(num_items);
  std::vector<std::string> keys(num_items);
  std::vector<std::string> values(num_items);
  for (unsigned int i = 0; i < num_items; i++) {
    user_keys[i] = "keys" + std::to_string(i+100);
    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
    AppendInternalKey(&keys[i], ikey);
    values[i] = "value" + std::to_string(i+100);
    // Make all hash values collide.
    AddHashLookups(user_keys[i], 0, num_hash_fun);
  }
  ikey_length = keys[0].size();
  value_length = values[0].size();
  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/FailWithTooManyCollisions_writable";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Stack-allocated so that the builder is destroyed at the end of the test
  // (it was previously allocated with new and never deleted).
  CuckooTableBuilder cuckoo_builder(
      writable_file.get(), ikey_length,
      value_length, hash_table_ratio, file_size, num_hash_fun,
      100, GetSliceHash);
  ASSERT_OK(cuckoo_builder.status());
  for (unsigned int key_idx = 0; key_idx < num_items-1; key_idx++) {
    cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
    ASSERT_OK(cuckoo_builder.status());
    ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1);
  }
  cuckoo_builder.Add(Slice(keys.back()), Slice(values.back()));
  ASSERT_TRUE(cuckoo_builder.status().IsCorruption());
  cuckoo_builder.Abandon();
  ASSERT_OK(writable_file->Close());
}
||||
|
||||
// Adding the same user key twice (with different sequence numbers) must make
// the second Add() fail with Corruption.
TEST(CuckooBuilderTest, FailWhenSameKeyInserted) {
  hash_map.clear();
  std::string user_key = "repeatedkey";
  AddHashLookups(user_key, 0, num_hash_fun);
  std::string key_to_reuse1, key_to_reuse2;
  ParsedInternalKey ikey1(user_key, 1000, kTypeValue);
  ParsedInternalKey ikey2(user_key, 1001, kTypeValue);
  AppendInternalKey(&key_to_reuse1, ikey1);
  AppendInternalKey(&key_to_reuse2, ikey2);
  std::string value = "value";
  ikey_length = key_to_reuse1.size();
  value_length = value.size();
  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/FailWhenSameKeyInserted_writable_file";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Stack-allocated so that the builder is destroyed at the end of the test
  // (it was previously allocated with new and never deleted).
  CuckooTableBuilder cuckoo_builder(
      writable_file.get(), ikey_length,
      value_length, hash_table_ratio, file_size, num_hash_fun,
      100, GetSliceHash);
  ASSERT_OK(cuckoo_builder.status());
  cuckoo_builder.Add(Slice(key_to_reuse1), Slice(value));
  ASSERT_OK(cuckoo_builder.status());
  ASSERT_EQ(cuckoo_builder.NumEntries(), 1);
  cuckoo_builder.Add(Slice(key_to_reuse2), Slice(value));
  ASSERT_TRUE(cuckoo_builder.status().IsCorruption());
  cuckoo_builder.Abandon();
  ASSERT_OK(writable_file->Close());
}
||||
|
||||
// Two hash functions; key i may live in bucket i or i + 1, so the first
// num_items - 1 keys form a chain. The last key collides with buckets 0 and
// 1 and forces the whole chain starting at bucket 1 to shift by one bucket.
TEST(CuckooBuilderTest, WithACollisionPath) {
  hash_map.clear();
  // Have two hash functions. Insert elements with overlapping hashes.
  // Finally insert an element which will displace all the current elements.
  num_hash_fun = 2;

  unsigned int max_search_depth = 100;
  num_items = max_search_depth + 2;
  std::vector<std::string> user_keys(num_items);
  std::vector<std::string> keys(num_items);
  std::vector<std::string> values(num_items);
  std::vector<unsigned int> expected_bucket_id(num_items);
  for (unsigned int i = 0; i < num_items - 1; i++) {
    user_keys[i] = "keys" + std::to_string(i+100);
    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
    AppendInternalKey(&keys[i], ikey);
    values[i] = "value" + std::to_string(i+100);
    // Make all hash values collide with the next element.
    AddHashLookups(user_keys[i], i, num_hash_fun);
    expected_bucket_id[i] = i+1;
  }
  expected_bucket_id[0] = 0;
  user_keys.back() = "keys" + std::to_string(num_items + 99);
  ParsedInternalKey ikey(user_keys.back(), num_items + 1000, kTypeValue);
  AppendInternalKey(&keys.back(), ikey);
  values.back() = "value" + std::to_string(num_items+100);
  // Make both hash values collide with first element.
  AddHashLookups(user_keys.back(), 0, num_hash_fun);
  expected_bucket_id.back() = 1;

  ikey_length = keys[0].size();
  value_length = values[0].size();
  key_length = ikey_length;
  bucket_length = key_length + value_length;

  expected_max_buckets = file_size / bucket_length;
  std::string expected_unused_user_key = "keys10:";
  ikey = ParsedInternalKey(expected_unused_user_key, 0, kTypeValue);
  AppendInternalKey(&expected_unused_bucket, ikey);
  expected_unused_bucket.resize(bucket_length, 'a');
  // Start from a file full of unused buckets and patch in the expected
  // entries while adding keys below.
  std::string expected_file_data = "";
  for (unsigned int i = 0; i < expected_max_buckets; i++) {
    expected_file_data += expected_unused_bucket;
  }

  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/WithCollisionPath_writable_file";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Stack-allocated so that the builder is destroyed at the end of the test
  // (it was previously allocated with new and never deleted).
  CuckooTableBuilder cuckoo_builder(
      writable_file.get(), key_length,
      value_length, hash_table_ratio, file_size,
      num_hash_fun, max_search_depth, GetSliceHash);
  ASSERT_OK(cuckoo_builder.status());
  for (unsigned int key_idx = 0; key_idx < num_items; key_idx++) {
    cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
    ASSERT_OK(cuckoo_builder.status());
    ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1);
    expected_file_data.replace(expected_bucket_id[key_idx]*bucket_length,
                               bucket_length,
                               keys[key_idx] + values[key_idx]);
  }
  ASSERT_OK(cuckoo_builder.Finish());
  ASSERT_OK(writable_file->Close());
  CheckFileContents(expected_file_data);
}
||||
|
||||
// Same key layout as the collision-path test but with one more key, so the
// final insertion is expected to be rejected: the builder's status must
// report Corruption after the last Add().
TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
  hash_map.clear();
  // Have two hash functions. Insert elements with overlapping hashes.
  // Finally insert an element which will displace all the current elements.
  num_hash_fun = 2;

  unsigned int max_search_depth = 100;
  num_items = max_search_depth + 3;
  std::vector<std::string> user_keys(num_items);
  std::vector<std::string> keys(num_items);
  std::vector<std::string> values(num_items);
  for (unsigned int i = 0; i < num_items - 1; i++) {
    user_keys[i] = "keys" + std::to_string(i + 100);
    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
    AppendInternalKey(&keys[i], ikey);
    values[i] = "value" + std::to_string(i + 100);
    // Make all hash values collide with the next element.
    AddHashLookups(user_keys[i], i, num_hash_fun);
  }
  user_keys.back() = "keys" + std::to_string(num_items + 99);
  ParsedInternalKey ikey(user_keys.back(), num_items + 1000, kTypeValue);
  AppendInternalKey(&keys.back(), ikey);
  // Store the value in the vector itself. Assigning through a temporary
  // Slice would leave values.back() empty, and the final Add() below would
  // then be handed a zero-length value.
  values.back() = "value" + std::to_string(num_items + 100);
  // Make both hash values collide with first element.
  AddHashLookups(user_keys.back(), 0, num_hash_fun);

  ikey_length = keys[0].size();
  value_length = values[0].size();
  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/FailWhenCollisionPathTooLong_writable";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Own the builder through a smart pointer so it is released when the test
  // ends. It is declared after writable_file, so it is destroyed first and
  // never outlives the file it holds a raw pointer to.
  unique_ptr<CuckooTableBuilder> cuckoo_builder(new CuckooTableBuilder(
      writable_file.get(), ikey_length,
      value_length, hash_table_ratio, file_size, num_hash_fun,
      max_search_depth, GetSliceHash));
  ASSERT_OK(cuckoo_builder->status());
  // All keys except the last one are expected to be accepted.
  for (unsigned int key_idx = 0; key_idx < num_items - 1; key_idx++) {
    cuckoo_builder->Add(Slice(keys[key_idx]), Slice(values[key_idx]));
    ASSERT_OK(cuckoo_builder->status());
    ASSERT_EQ(cuckoo_builder->NumEntries(), key_idx + 1);
  }
  cuckoo_builder->Add(Slice(keys.back()), Slice(values.back()));
  ASSERT_TRUE(cuckoo_builder->status().IsCorruption());
  cuckoo_builder->Abandon();
  ASSERT_OK(writable_file->Close());
}
||||
|
||||
// Sizes the file so that exactly num_items - 1 buckets fit, inserts that many
// keys successfully, and then expects the builder's status to report
// Corruption when one more key is added.
TEST(CuckooBuilderTest, FailWhenTableIsFull) {
  hash_map.clear();
  file_size = 160;

  num_items = 7;
  std::vector<std::string> user_keys(num_items);
  std::vector<std::string> keys(num_items);
  std::vector<std::string> values(num_items);
  for (unsigned int i = 0; i < num_items; i++) {
    user_keys[i] = "keys" + std::to_string(i + 1000);
    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
    AppendInternalKey(&keys[i], ikey);
    values[i] = "value" + std::to_string(i + 100);
    AddHashLookups(user_keys[i], i, num_hash_fun);
  }
  ikey_length = keys[0].size();
  value_length = values[0].size();
  bucket_length = ikey_length + value_length;
  // Check that number of items is tight.
  ASSERT_GT(bucket_length * num_items, file_size);
  ASSERT_LE(bucket_length * (num_items - 1), file_size);

  unique_ptr<WritableFile> writable_file;
  fname = test::TmpDir() + "/FailWhenTabelIsFull_writable";
  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
  // Own the builder through a smart pointer so it is released when the test
  // ends. It is declared after writable_file, so it is destroyed first and
  // never outlives the file it holds a raw pointer to.
  unique_ptr<CuckooTableBuilder> cuckoo_builder(new CuckooTableBuilder(
      writable_file.get(), ikey_length,
      value_length, hash_table_ratio, file_size, num_hash_fun,
      100, GetSliceHash));
  ASSERT_OK(cuckoo_builder->status());
  // The first num_items - 1 keys fit in the file and must be accepted.
  for (unsigned int key_idx = 0; key_idx < num_items - 1; key_idx++) {
    cuckoo_builder->Add(Slice(keys[key_idx]), Slice(values[key_idx]));
    ASSERT_OK(cuckoo_builder->status());
    ASSERT_EQ(cuckoo_builder->NumEntries(), key_idx + 1);
  }
  // One more key than the file has room for.
  cuckoo_builder->Add(Slice(keys.back()), Slice(values.back()));
  ASSERT_TRUE(cuckoo_builder->status().IsCorruption());
  cuckoo_builder->Abandon();
  ASSERT_OK(writable_file->Close());
}
||||
} // namespace rocksdb
|
||||
|
||||
// Entry point of the test binary: delegates to rocksdb::test::RunAllTests()
// and hands its result back as the process exit code. The command-line
// arguments are not used.
int main(int argc, char** argv) {
  const int exit_code = rocksdb::test::RunAllTests();
  return exit_code;
}
@ -0,0 +1,32 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
#ifndef ROCKSDB_LITE |
||||
|
||||
#include "util/murmurhash.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
static const unsigned int kMaxNumHashTable = 64; |
||||
|
||||
unsigned int GetSliceMurmurHash(const Slice& s, unsigned int index, |
||||
unsigned int max_num_buckets) { |
||||
static constexpr unsigned int seeds[kMaxNumHashTable] = { |
||||
816922183, 506425713, 949485004, 22513986, 421427259, 500437285, |
||||
888981693, 847587269, 511007211, 722295391, 934013645, 566947683, |
||||
193618736, 428277388, 770956674, 819994962, 755946528, 40807421, |
||||
263144466, 241420041, 444294464, 731606396, 304158902, 563235655, |
||||
968740453, 336996831, 462831574, 407970157, 985877240, 637708754, |
||||
736932700, 205026023, 755371467, 729648411, 807744117, 46482135, |
||||
847092855, 620960699, 102476362, 314094354, 625838942, 550889395, |
||||
639071379, 834567510, 397667304, 151945969, 443634243, 196618243, |
||||
421986347, 407218337, 964502417, 327741231, 493359459, 452453139, |
||||
692216398, 108161624, 816246924, 234779764, 618949448, 496133787, |
||||
156374056, 316589799, 982915425, 553105889 }; |
||||
return MurmurHash(s.data(), s.size(), seeds[index]) % max_num_buckets; |
||||
} |
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
Loading…
Reference in new issue