Summary: Currently, the in-memory hash index of blockbased table uses a precise hash map to track the prefix to block range mapping. In some use cases, especially when prefix itself is big, the memory overhead becomes a problem. This diff introduces a fixed hash bucket array that does not store the prefix and allows prefix collision, which is similar to the plaintable hash index, in order to reduce the memory consumption. Just a quick draft, still testing and refining. Test Plan: unit test and shadow testing Reviewers: dhruba, kailiu, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19047main
parent
3525aac9e5
commit
0f0076ed5a
@ -0,0 +1,242 @@ |
|||||||
|
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#include "table/block_prefix_index.h" |
||||||
|
|
||||||
|
#include <vector> |
||||||
|
|
||||||
|
#include "rocksdb/comparator.h" |
||||||
|
#include "rocksdb/slice.h" |
||||||
|
#include "rocksdb/slice_transform.h" |
||||||
|
#include "util/arena.h" |
||||||
|
#include "util/coding.h" |
||||||
|
#include "util/hash.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
namespace { |
||||||
|
|
||||||
|
inline uint32_t Hash(const Slice& s) { |
||||||
|
return rocksdb::Hash(s.data(), s.size(), 0); |
||||||
|
} |
||||||
|
|
||||||
|
inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) { |
||||||
|
return Hash(prefix) % num_buckets; |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// The prefix block index is simply a bucket array, with each entry pointing to
|
||||||
|
// the blocks that span the prefixes hashed to this bucket.
|
||||||
|
//
|
||||||
|
// To reduce memory footprint, if there is only one block per bucket, the entry
|
||||||
|
// stores the block id directly. If there are more than one blocks per bucket,
|
||||||
|
// because of hash collision or a single prefix spanning multiple blocks,
|
||||||
|
// the entry points to an array of block ids. The block array is an array of
|
||||||
|
// uint32_t's. The first uint32_t indicates the total number of blocks, followed
|
||||||
|
// by the block ids.
|
||||||
|
//
|
||||||
|
// To differentiate the two cases, the high order bit of the entry indicates
|
||||||
|
// whether it is a 'pointer' into a separate block array.
|
||||||
|
// 0x7FFFFFFF is reserved for empty bucket.
|
||||||
|
|
||||||
|
const uint32_t kNoneBlock = 0x7FFFFFFF; |
||||||
|
const uint32_t kBlockArrayMask = 0x80000000; |
||||||
|
|
||||||
|
inline bool IsNone(uint32_t block_id) { |
||||||
|
return block_id == kNoneBlock; |
||||||
|
} |
||||||
|
|
||||||
|
inline bool IsBlockId(uint32_t block_id) { |
||||||
|
return (block_id & kBlockArrayMask) == 0; |
||||||
|
} |
||||||
|
|
||||||
|
inline uint32_t DecodeIndex(uint32_t block_id) { |
||||||
|
uint32_t index = block_id ^ kBlockArrayMask; |
||||||
|
assert(index < kBlockArrayMask); |
||||||
|
return index; |
||||||
|
} |
||||||
|
|
||||||
|
inline uint32_t EncodeIndex(uint32_t index) { |
||||||
|
assert(index < kBlockArrayMask); |
||||||
|
return index | kBlockArrayMask; |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
// temporary storage for prefix information during index building
|
||||||
|
struct PrefixRecord { |
||||||
|
Slice prefix; |
||||||
|
uint32_t start_block; |
||||||
|
uint32_t end_block; |
||||||
|
uint32_t num_blocks; |
||||||
|
PrefixRecord* next; |
||||||
|
}; |
||||||
|
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
|
class BlockPrefixIndex::Builder { |
||||||
|
public: |
||||||
|
explicit Builder(const SliceTransform* internal_prefix_extractor) |
||||||
|
: internal_prefix_extractor_(internal_prefix_extractor) {} |
||||||
|
|
||||||
|
void Add(const Slice& key_prefix, uint32_t start_block, |
||||||
|
uint32_t num_blocks) { |
||||||
|
PrefixRecord* record = reinterpret_cast<PrefixRecord*>( |
||||||
|
arena_.AllocateAligned(sizeof(PrefixRecord))); |
||||||
|
record->prefix = key_prefix; |
||||||
|
record->start_block = start_block; |
||||||
|
record->end_block = start_block + num_blocks - 1; |
||||||
|
record->num_blocks = num_blocks; |
||||||
|
prefixes_.push_back(record); |
||||||
|
} |
||||||
|
|
||||||
|
BlockPrefixIndex* Finish() { |
||||||
|
// For now, use roughly 1:1 prefix to bucket ratio.
|
||||||
|
uint32_t num_buckets = prefixes_.size() + 1; |
||||||
|
|
||||||
|
// Collect prefix records that hash to the same bucket, into a single
|
||||||
|
// linklist.
|
||||||
|
std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr); |
||||||
|
std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0); |
||||||
|
for (PrefixRecord* current : prefixes_) { |
||||||
|
uint32_t bucket = PrefixToBucket(current->prefix, num_buckets); |
||||||
|
// merge the prefix block span if the first block of this prefix is
|
||||||
|
// connected to the last block of the previous prefix.
|
||||||
|
PrefixRecord* prev = prefixes_per_bucket[bucket]; |
||||||
|
if (prev) { |
||||||
|
assert(current->start_block >= prev->end_block); |
||||||
|
auto distance = current->start_block - prev->end_block; |
||||||
|
if (distance <= 1) { |
||||||
|
prev->end_block = current->end_block; |
||||||
|
prev->num_blocks = prev->end_block - prev->start_block + 1; |
||||||
|
continue; |
||||||
|
} |
||||||
|
} |
||||||
|
current->next = prev; |
||||||
|
prefixes_per_bucket[bucket] = current; |
||||||
|
num_blocks_per_bucket[bucket] += current->num_blocks; |
||||||
|
} |
||||||
|
|
||||||
|
// Calculate the block array buffer size
|
||||||
|
uint32_t total_block_array_entries = 0; |
||||||
|
for (uint32_t i = 0; i < num_buckets; i++) { |
||||||
|
uint32_t num_blocks = num_blocks_per_bucket[i]; |
||||||
|
if (num_blocks > 1) { |
||||||
|
total_block_array_entries += (num_blocks + 1); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Populate the final prefix block index
|
||||||
|
uint32_t* block_array_buffer = new uint32_t[total_block_array_entries]; |
||||||
|
uint32_t* buckets = new uint32_t[num_buckets]; |
||||||
|
uint32_t offset = 0; |
||||||
|
for (uint32_t i = 0; i < num_buckets; i++) { |
||||||
|
uint32_t num_blocks = num_blocks_per_bucket[i]; |
||||||
|
if (num_blocks == 0) { |
||||||
|
assert(prefixes_per_bucket[i] == nullptr); |
||||||
|
buckets[i] = kNoneBlock; |
||||||
|
} else if (num_blocks == 1) { |
||||||
|
assert(prefixes_per_bucket[i] != nullptr); |
||||||
|
assert(prefixes_per_bucket[i]->next == nullptr); |
||||||
|
buckets[i] = prefixes_per_bucket[i]->start_block; |
||||||
|
} else { |
||||||
|
assert(prefixes_per_bucket[i] != nullptr); |
||||||
|
buckets[i] = EncodeIndex(offset); |
||||||
|
block_array_buffer[offset] = num_blocks; |
||||||
|
uint32_t* last_block = &block_array_buffer[offset + num_blocks]; |
||||||
|
auto current = prefixes_per_bucket[i]; |
||||||
|
// populate block ids from largest to smallest
|
||||||
|
while (current != nullptr) { |
||||||
|
for (uint32_t i = 0; i < current->num_blocks; i++) { |
||||||
|
*last_block = current->end_block - i; |
||||||
|
last_block--; |
||||||
|
} |
||||||
|
current = current->next; |
||||||
|
} |
||||||
|
assert(last_block == &block_array_buffer[offset]); |
||||||
|
offset += (num_blocks + 1); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
assert(offset == total_block_array_entries); |
||||||
|
|
||||||
|
return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, |
||||||
|
buckets, total_block_array_entries, |
||||||
|
block_array_buffer); |
||||||
|
} |
||||||
|
|
||||||
|
private: |
||||||
|
const SliceTransform* internal_prefix_extractor_; |
||||||
|
|
||||||
|
std::vector<PrefixRecord*> prefixes_; |
||||||
|
Arena arena_; |
||||||
|
}; |
||||||
|
|
||||||
|
|
||||||
|
Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, |
||||||
|
const Slice& prefixes, const Slice& prefix_meta, |
||||||
|
BlockPrefixIndex** prefix_index) { |
||||||
|
uint64_t pos = 0; |
||||||
|
auto meta_pos = prefix_meta; |
||||||
|
Status s; |
||||||
|
Builder builder(internal_prefix_extractor); |
||||||
|
|
||||||
|
while (!meta_pos.empty()) { |
||||||
|
uint32_t prefix_size = 0; |
||||||
|
uint32_t entry_index = 0; |
||||||
|
uint32_t num_blocks = 0; |
||||||
|
if (!GetVarint32(&meta_pos, &prefix_size) || |
||||||
|
!GetVarint32(&meta_pos, &entry_index) || |
||||||
|
!GetVarint32(&meta_pos, &num_blocks)) { |
||||||
|
s = Status::Corruption( |
||||||
|
"Corrupted prefix meta block: unable to read from it."); |
||||||
|
break; |
||||||
|
} |
||||||
|
if (pos + prefix_size > prefixes.size()) { |
||||||
|
s = Status::Corruption( |
||||||
|
"Corrupted prefix meta block: size inconsistency."); |
||||||
|
break; |
||||||
|
} |
||||||
|
Slice prefix(prefixes.data() + pos, prefix_size); |
||||||
|
builder.Add(prefix, entry_index, num_blocks); |
||||||
|
|
||||||
|
pos += prefix_size; |
||||||
|
} |
||||||
|
|
||||||
|
if (s.ok() && pos != prefixes.size()) { |
||||||
|
s = Status::Corruption("Corrupted prefix meta block"); |
||||||
|
} |
||||||
|
|
||||||
|
if (s.ok()) { |
||||||
|
*prefix_index = builder.Finish(); |
||||||
|
} |
||||||
|
|
||||||
|
return s; |
||||||
|
} |
||||||
|
|
||||||
|
const uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, |
||||||
|
uint32_t** blocks) { |
||||||
|
Slice prefix = internal_prefix_extractor_->Transform(key); |
||||||
|
|
||||||
|
uint32_t bucket = PrefixToBucket(prefix, num_buckets_); |
||||||
|
uint32_t block_id = buckets_[bucket]; |
||||||
|
|
||||||
|
if (IsNone(block_id)) { |
||||||
|
return 0; |
||||||
|
} else if (IsBlockId(block_id)) { |
||||||
|
*blocks = &buckets_[bucket]; |
||||||
|
return 1; |
||||||
|
} else { |
||||||
|
uint32_t index = DecodeIndex(block_id); |
||||||
|
assert(index < num_block_array_buffer_entries_); |
||||||
|
*blocks = &block_array_buffer_[index+1]; |
||||||
|
uint32_t num_blocks = block_array_buffer_[index]; |
||||||
|
assert(num_blocks > 1); |
||||||
|
assert(index + num_blocks < num_block_array_buffer_entries_); |
||||||
|
return num_blocks; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
} // namespace rocksdb
|
@ -0,0 +1,67 @@ |
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
#pragma once |
||||||
|
|
||||||
|
#include "rocksdb/status.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
class Comparator; |
||||||
|
class Iterator; |
||||||
|
class Slice; |
||||||
|
class SliceTransform; |
||||||
|
|
||||||
|
// Build a hash-based index to speed up the lookup for "index block".
|
||||||
|
// BlockHashIndex accepts a key and, if found, returns its restart index within
|
||||||
|
// that index block.
|
||||||
|
class BlockPrefixIndex { |
||||||
|
public: |
||||||
|
|
||||||
|
// Maps a key to a list of data blocks that could potentially contain
|
||||||
|
// the key, based on the prefix.
|
||||||
|
// Returns the total number of relevant blocks, 0 means the key does
|
||||||
|
// not exist.
|
||||||
|
const uint32_t GetBlocks(const Slice& key, uint32_t** blocks); |
||||||
|
|
||||||
|
size_t ApproximateMemoryUsage() const { |
||||||
|
return sizeof(BlockPrefixIndex) + |
||||||
|
(num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t); |
||||||
|
} |
||||||
|
|
||||||
|
// Create hash index by reading from the metadata blocks.
|
||||||
|
// @params prefixes: a sequence of prefixes.
|
||||||
|
// @params prefix_meta: contains the "metadata" to of the prefixes.
|
||||||
|
static Status Create(const SliceTransform* hash_key_extractor, |
||||||
|
const Slice& prefixes, const Slice& prefix_meta, |
||||||
|
BlockPrefixIndex** prefix_index); |
||||||
|
|
||||||
|
~BlockPrefixIndex() { |
||||||
|
delete[] buckets_; |
||||||
|
delete[] block_array_buffer_; |
||||||
|
} |
||||||
|
|
||||||
|
private: |
||||||
|
class Builder; |
||||||
|
friend Builder; |
||||||
|
|
||||||
|
BlockPrefixIndex(const SliceTransform* internal_prefix_extractor, |
||||||
|
uint32_t num_buckets, |
||||||
|
uint32_t* buckets, |
||||||
|
uint32_t num_block_array_buffer_entries, |
||||||
|
uint32_t* block_array_buffer) |
||||||
|
: internal_prefix_extractor_(internal_prefix_extractor), |
||||||
|
num_buckets_(num_buckets), |
||||||
|
num_block_array_buffer_entries_(num_block_array_buffer_entries), |
||||||
|
buckets_(buckets), |
||||||
|
block_array_buffer_(block_array_buffer) {} |
||||||
|
|
||||||
|
const SliceTransform* internal_prefix_extractor_; |
||||||
|
uint32_t num_buckets_; |
||||||
|
uint32_t num_block_array_buffer_entries_; |
||||||
|
uint32_t* buckets_; |
||||||
|
uint32_t* block_array_buffer_; |
||||||
|
}; |
||||||
|
|
||||||
|
} // namespace rocksdb
|
Loading…
Reference in new issue