Summary: Adds an option to save the PlainTable index and bloom filter in the SST file. If there is no bloom block and/or index block, PlainTableReader builds new ones; otherwise PlainTableReader just uses these blocks. Test Plan: make all check. Reviewers: sdong. Reviewed By: sdong. Subscribers: leveldb. Differential Revision: https://reviews.facebook.net/D19527 (branch: main)
parent
92d73cbe78
commit
9d70cce047
@ -0,0 +1,23 @@ |
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/bloom_block.h" |
||||
|
||||
#include <string> |
||||
#include "rocksdb/slice.h" |
||||
#include "util/dynamic_bloom.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t> keys_hashes) { |
||||
for (auto hash : keys_hashes) { |
||||
bloom_.AddHash(hash); |
||||
} |
||||
} |
||||
|
||||
Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } |
||||
|
||||
const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; |
||||
} // namespace rocksdb
|
@ -0,0 +1,37 @@ |
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once |
||||
|
||||
#include <vector> |
||||
#include <string> |
||||
#include "util/dynamic_bloom.h" |
||||
|
||||
namespace rocksdb { |
||||
class Logger; |
||||
|
||||
// Builds the bloom-filter block that PlainTable writes into the SST file.
// Callers size the filter with SetTotalBits(), feed precomputed key hashes
// via AddKeysHashes(), then call Finish() to obtain the raw bits to write.
class BloomBlockBuilder {
 public:
  // Name of the bloom block in the SST file's metaindex.
  static const std::string kBloomBlock;

  // num_probes: number of hash probes per key used by the bloom filter.
  explicit BloomBlockBuilder(uint32_t num_probes = 6)
      : bloom_(num_probes, nullptr) {}

  // Sizes the underlying bloom filter; must be called before adding hashes.
  // Memory comes from `arena`; see DynamicBloom::SetTotalBits for the
  // meaning of locality / huge_page_tlb_size.
  void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality,
                    size_t huge_page_tlb_size, Logger* logger) {
    bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size,
                        logger);
  }

  uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }

  // Adds a batch of precomputed key hashes to the filter.
  // NOTE(review): the vector is passed by value (copied per call); const&
  // would avoid that but must be changed together with the .cc definition.
  void AddKeysHashes(const std::vector<uint32_t> keys_hashes);

  // Returns a slice over the raw filter bits; valid while this builder lives.
  Slice Finish();

 private:
  DynamicBloom bloom_;
};
||||
|
||||
}; // namespace rocksdb
|
@ -0,0 +1,196 @@ |
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/plain_table_index.h" |
||||
#include "util/coding.h" |
||||
#include "util/hash.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
namespace {
// Maps a prefix hash onto one of the `num_buckets` index buckets.
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  assert(num_buckets > 0);
  const uint32_t bucket_id = hash % num_buckets;
  return bucket_id;
}
}  // namespace
||||
|
||||
// Initializes the in-memory index from a raw index block `data`, laid out as:
// varint32 index_size | varint32 num_prefixes | index array (index_size
// fixed32 buckets) | sub-index buffer. The object aliases the memory backing
// `data`, so the caller must keep that buffer alive.
void PlainTableIndex::InitFromRawData(Slice data) {
  // Keep the GetVarint32 calls outside assert(): they advance `data` and
  // fill the members, and side effects inside assert() disappear in NDEBUG
  // (release) builds, which would leave the index uninitialized.
  bool ok = GetVarint32(&data, &index_size_);
  assert(ok);
  assert(index_size_ > 0);
  ok = GetVarint32(&data, &num_prefixes_);
  assert(ok);
  (void)ok;  // silence unused-variable warning when asserts are compiled out
  sub_index_size_ = data.size() - index_size_ * kOffsetLen;

  // The remaining bytes are the bucket array followed by the sub-index
  // buffer; point directly into them (no copy).
  char* index_data_begin = const_cast<char*>(data.data());
  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
}
||||
|
||||
// Looks up the bucket for `prefix_hash`. *bucket_value receives either a
// file offset (kDirectToFile) or a sub-index offset with the flag bit
// cleared (kSubindex); the return value says which interpretation applies.
PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
    uint32_t prefix_hash, uint32_t* bucket_value) const {
  const uint32_t bucket_id = GetBucketIdFromHash(prefix_hash, index_size_);
  *bucket_value = index_[bucket_id];
  if (*bucket_value & kSubIndexMask) {
    // Flag bit set: value points into the binary-search sub-index buffer.
    *bucket_value &= ~kSubIndexMask;
    return kSubindex;
  }
  // Flag bit clear: either the empty-bucket sentinel or a direct file offset.
  return (*bucket_value >= kMaxFileSize) ? kNoPrefixForBucket : kDirectToFile;
}
||||
|
||||
// Appends one (hash, file offset) record, allocating a fresh fixed-size
// group when the current one is full.
void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash,
                                                        uint32_t offset) {
  const bool group_full =
      (num_records_in_current_group_ == kNumRecordsPerGroup);
  if (group_full) {
    current_group_ = AllocateNewGroup();
    num_records_in_current_group_ = 0;
  }
  IndexRecord& rec = current_group_[num_records_in_current_group_];
  ++num_records_in_current_group_;
  rec.hash = hash;
  rec.offset = offset;
  rec.next = nullptr;
}
||||
|
||||
// Registers one key (its prefix and file offset) with the index builder.
// An index record is emitted for the first key of each new prefix, and then
// for every index_sparseness_-th key within the same prefix.
void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
                                          uint64_t key_offset) {
  // Compare against the cached prefix without materializing a temporary
  // std::string (the previous ToString() comparison allocated on every key).
  if (is_first_record_ || key_prefix_slice != Slice(prev_key_prefix_)) {
    ++num_prefixes_;
    if (!is_first_record_) {
      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
    }
    num_keys_per_prefix_ = 0;
    // Assign in place to reuse the string's existing capacity.
    prev_key_prefix_.assign(key_prefix_slice.data(), key_prefix_slice.size());
    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
    due_index_ = true;
  }

  if (due_index_) {
    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
    due_index_ = false;
  }

  num_keys_per_prefix_++;
  // index_sparseness_ == 0 means "index every key".
  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
    due_index_ = true;
  }
  is_first_record_ = false;
}
||||
|
||||
// Finalizes the index: sizes the hash table, distributes the collected
// records into buckets, and serializes everything into an arena-backed
// buffer. The returned Slice can initialize a PlainTableIndex or be written
// to the SST file.
Slice PlainTableIndexBuilder::Finish() {
  AllocateIndex();

  // Temporary per-bucket chains and entry counts used while serializing.
  std::vector<IndexRecord*> bucket_heads(index_size_, nullptr);
  std::vector<uint32_t> bucket_sizes(index_size_, 0);
  BucketizeIndexes(&bucket_heads, &bucket_sizes);

  // Account for the final prefix, then report the key distribution.
  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
      keys_per_prefix_hist_.ToString().c_str());

  // From the temp data structure, populate indexes.
  return FillIndexes(bucket_heads, bucket_sizes);
}
||||
|
||||
void PlainTableIndexBuilder::AllocateIndex() { |
||||
if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) { |
||||
// Fall back to pure binary search if the user fails to specify a prefix
|
||||
// extractor.
|
||||
index_size_ = 1; |
||||
} else { |
||||
double hash_table_size_multipier = 1.0 / hash_table_ratio_; |
||||
index_size_ = num_prefixes_ * hash_table_size_multipier + 1; |
||||
assert(index_size_ > 0); |
||||
} |
||||
} |
||||
|
||||
void PlainTableIndexBuilder::BucketizeIndexes( |
||||
std::vector<IndexRecord*>* hash_to_offsets, |
||||
std::vector<uint32_t>* entries_per_bucket) { |
||||
bool first = true; |
||||
uint32_t prev_hash = 0; |
||||
size_t num_records = record_list_.GetNumRecords(); |
||||
for (size_t i = 0; i < num_records; i++) { |
||||
IndexRecord* index_record = record_list_.At(i); |
||||
uint32_t cur_hash = index_record->hash; |
||||
if (first || prev_hash != cur_hash) { |
||||
prev_hash = cur_hash; |
||||
first = false; |
||||
} |
||||
uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); |
||||
IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; |
||||
index_record->next = prev_bucket_head; |
||||
(*hash_to_offsets)[bucket] = index_record; |
||||
(*entries_per_bucket)[bucket]++; |
||||
} |
||||
|
||||
sub_index_size_ = 0; |
||||
for (auto entry_count : *entries_per_bucket) { |
||||
if (entry_count <= 1) { |
||||
continue; |
||||
} |
||||
// Only buckets with more than 1 entry will have subindex.
|
||||
sub_index_size_ += VarintLength(entry_count); |
||||
// total bytes needed to store these entries' in-file offsets.
|
||||
sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen; |
||||
} |
||||
} |
||||
|
||||
// Serializes the bucketized index into one arena-backed buffer with layout:
//   varint32 index_size | varint32 num_prefixes |
//   index_size fixed32 buckets | sub-index buffer
// Returns a Slice over the buffer; the arena owns the memory.
Slice PlainTableIndexBuilder::FillIndexes(
    const std::vector<IndexRecord*>& hash_to_offsets,
    const std::vector<uint32_t>& entries_per_bucket) {
  Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
      sub_index_size_);
  auto total_allocate_size = GetTotalSize();
  char* allocated = arena_->AllocateAligned(
      total_allocate_size, huge_page_tlb_size_, options_.info_log.get());

  // Write the two varint headers, then carve the remainder into the fixed32
  // bucket array followed by the sub-index region.
  auto temp_ptr = EncodeVarint32(allocated, index_size_);
  uint32_t* index =
      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
  char* sub_index = reinterpret_cast<char*>(index + index_size_);

  size_t sub_index_offset = 0;
  for (uint32_t i = 0; i < index_size_; i++) {
    uint32_t num_keys_for_bucket = entries_per_bucket[i];
    switch (num_keys_for_bucket) {
      case 0:
        // No key for bucket
        index[i] = PlainTableIndex::kMaxFileSize;
        break;
      case 1:
        // point directly to the file offset
        index[i] = hash_to_offsets[i]->offset;
        break;
      default:
        // point to second level indexes.
        index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask;
        // Varint entry count, then the fixed32 file offsets.
        char* prev_ptr = &sub_index[sub_index_offset];
        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
        sub_index_offset += (cur_ptr - prev_ptr);
        char* sub_index_pos = &sub_index[sub_index_offset];
        // The bucket chain was built by push-front (reverse insertion
        // order), so writing entries back-to-front (j counts down) restores
        // the original order.
        IndexRecord* record = hash_to_offsets[i];
        int j;
        for (j = num_keys_for_bucket - 1; j >= 0 && record;
             j--, record = record->next) {
          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
        }
        // Chain length and bucket count must agree exactly.
        assert(j == -1 && record == nullptr);
        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
        assert(sub_index_offset <= sub_index_size_);
        break;
    }
  }
  assert(sub_index_offset == sub_index_size_);

  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
      index_size_, sub_index_size_);
  return Slice(allocated, GetTotalSize());
}

// Name under which the index block is stored in the SST file's metaindex.
const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
    "PlainTableIndexBlock";
||||
}; // namespace rocksdb
|
@ -0,0 +1,221 @@ |
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
|
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
#include "db/dbformat.h" |
||||
#include "rocksdb/options.h" |
||||
#include "util/murmurhash.h" |
||||
#include "util/hash.h" |
||||
#include "util/arena.h" |
||||
#include "util/histogram.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
// PlainTableIndex contains buckets size of index_size_, each is a
|
||||
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
|
||||
// and the first bit of the integer indicates type of the offset.
|
||||
//
|
||||
// +--------------+------------------------------------------------------+
|
||||
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
||||
// +--------------+------------------------------------------------------+
|
||||
//
|
||||
// Explanation for the "flag bit":
|
||||
//
|
||||
// 0 indicates that the bucket contains only one prefix (no conflict when
|
||||
// hashing this prefix), whose first row starts from this offset of the
|
||||
// file.
|
||||
// 1 indicates that the bucket contains more than one prefixes, or there
|
||||
// are too many rows for one prefix so we need a binary search for it. In
|
||||
// this case, the offset indicates the offset of sub_index_ holding the
|
||||
// binary search indexes of keys for those rows. Those binary search indexes
|
||||
// are organized in this way:
|
||||
//
|
||||
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
|
||||
// it, there are N 32-bit integers, each points of an offset of the file,
|
||||
// which
|
||||
// points to starting of a row. Those offsets need to be guaranteed to be in
|
||||
// ascending order so the keys they are pointing to are also in ascending
|
||||
// order
|
||||
// to make sure we can use them to do binary searches. Below is visual
|
||||
// presentation of a bucket.
|
||||
//
|
||||
// <begin>
|
||||
// number_of_records: varint32
|
||||
// record 1 file offset: fixedint32
|
||||
// record 2 file offset: fixedint32
|
||||
// ....
|
||||
// record N file offset: fixedint32
|
||||
// <end>
|
||||
// In-memory view of PlainTable's hash index. See the file-level comment
// above for the on-disk bucket layout (1 flag bit + 31-bit offset, followed
// by an optional binary-search sub-index buffer).
class PlainTableIndex {
 public:
  // Outcome of a bucket lookup in GetOffset().
  enum IndexSearchResult {
    kNoPrefixForBucket = 0,  // bucket is empty: no key with this prefix
    kDirectToFile = 1,       // bucket value is the file offset of a row
    kSubindex = 2            // bucket value is an offset into sub_index_
  };

  // Builds the index directly from a raw index block read from an SST file.
  explicit PlainTableIndex(Slice data) { InitFromRawData(data); }

  PlainTableIndex()
      : index_size_(0),
        sub_index_size_(0),
        num_prefixes_(0),
        index_(nullptr),
        sub_index_(nullptr) {}

  // Looks up the bucket for prefix_hash; *bucket_value receives the decoded
  // offset and the return value says how to interpret it (see enum above).
  IndexSearchResult GetOffset(uint32_t prefix_hash,
                              uint32_t* bucket_value) const;

  // Points index_/sub_index_ into `data`; caller must keep the backing
  // memory alive for the lifetime of this object (no copy is made).
  void InitFromRawData(Slice data);

  // For a kSubindex bucket value: returns a pointer to the first fixed32
  // file offset in that bucket's sub-index and stores the entry count in
  // *upper_bound.
  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
                                              uint32_t* upper_bound) const {
    const char* index_ptr = &sub_index_[offset];
    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
  }

  uint32_t GetIndexSize() const { return index_size_; }

  uint32_t GetSubIndexSize() const { return sub_index_size_; }

  uint32_t GetNumPrefixes() const { return num_prefixes_; }

  // Sentinel stored in empty buckets; also the largest representable 31-bit
  // file offset.
  static const uint64_t kMaxFileSize = (1u << 31) - 1;
  // High bit flags a bucket value as a sub-index offset.
  static const uint32_t kSubIndexMask = 0x80000000;
  static const size_t kOffsetLen = sizeof(uint32_t);

 private:
  uint32_t index_size_;    // number of hash buckets
  size_t sub_index_size_;  // bytes in the binary-search sub-index buffer
  uint32_t num_prefixes_;  // distinct prefixes covered by the index

  uint32_t* index_;        // bucket array (aliases the raw data)
  char* sub_index_;        // sub-index buffer (aliases the raw data)
};
||||
|
||||
// PlainTableIndexBuilder is used to create plain table index.
|
||||
// After calling Finish(), it returns Slice, which is usually
|
||||
// used either to initialize PlainTableIndex or
|
||||
// to save index to sst file.
|
||||
// For more details about the index, please refer to:
|
||||
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
||||
// #wiki-in-memory-index-format
|
||||
class PlainTableIndexBuilder {
 public:
  // index_sparseness: emit an index record every N keys within one prefix
  //   (0 means every key). hash_table_ratio: target entries-per-bucket
  //   ratio; <= 0 falls back to a single binary-searched bucket.
  // NOTE(review): huge_page_tlb_size is declared double here but is used as
  //   an allocation size in FillIndexes — presumably should be size_t;
  //   confirm against Arena::AllocateAligned.
  PlainTableIndexBuilder(Arena* arena, const Options& options,
                         uint32_t index_sparseness, double hash_table_ratio,
                         double huge_page_tlb_size)
      : arena_(arena),
        options_(options),
        record_list_(kRecordsPerGroup),
        is_first_record_(true),
        due_index_(false),
        num_prefixes_(0),
        num_keys_per_prefix_(0),
        prev_key_prefix_hash_(0),
        index_sparseness_(index_sparseness),
        prefix_extractor_(options.prefix_extractor.get()),
        hash_table_ratio_(hash_table_ratio),
        huge_page_tlb_size_(huge_page_tlb_size) {}

  // Registers one key (prefix + file offset); must be called in key order.
  void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset);

  // Serializes the index; the returned Slice is backed by the arena.
  Slice Finish();

  // Total serialized size: two varint headers + bucket array + sub-index.
  uint32_t GetTotalSize() const {
    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
  }

  // Name of the index block in the SST file's metaindex.
  static const std::string kPlainTableIndexBlock;

 private:
  struct IndexRecord {
    uint32_t hash;    // hash of the prefix
    uint32_t offset;  // offset of a row
    IndexRecord* next;
  };

  // Helper class to track all the index records
  class IndexRecordList {
   public:
    explicit IndexRecordList(size_t num_records_per_group)
        : kNumRecordsPerGroup(num_records_per_group),
          current_group_(nullptr),
          // Start "full" so the first AddRecord() allocates the first group.
          num_records_in_current_group_(num_records_per_group) {}

    ~IndexRecordList() {
      for (size_t i = 0; i < groups_.size(); i++) {
        delete[] groups_[i];
      }
    }

    void AddRecord(murmur_t hash, uint32_t offset);

    // NOTE(review): when groups_ is empty, (0 - 1) wraps in size_t but the
    // full expression still evaluates to 0 because
    // num_records_in_current_group_ starts at kNumRecordsPerGroup — fragile
    // but correct under unsigned modular arithmetic.
    size_t GetNumRecords() const {
      return (groups_.size() - 1) * kNumRecordsPerGroup +
             num_records_in_current_group_;
    }
    // Random access by global record index across all groups.
    IndexRecord* At(size_t index) {
      return &(groups_[index / kNumRecordsPerGroup]
                      [index % kNumRecordsPerGroup]);
    }

   private:
    IndexRecord* AllocateNewGroup() {
      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
      groups_.push_back(result);
      return result;
    }

    // Each group in `groups_` contains fix-sized records (determined by
    // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
    // occurs.
    const size_t kNumRecordsPerGroup;
    IndexRecord* current_group_;
    // List of arrays allocated
    std::vector<IndexRecord*> groups_;
    size_t num_records_in_current_group_;
  };

  // Chooses index_size_ (number of hash buckets).
  void AllocateIndex();

  // Internal helper function to bucket index record list to hash buckets.
  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
                        std::vector<uint32_t>* entries_per_bucket);

  // Internal helper class to fill the indexes and bloom filters to internal
  // data structures.
  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
                    const std::vector<uint32_t>& entries_per_bucket);

  Arena* arena_;                        // owns serialized index memory
  Options options_;
  HistogramImpl keys_per_prefix_hist_;  // stats: keys per distinct prefix
  IndexRecordList record_list_;         // all index records collected so far
  bool is_first_record_;
  bool due_index_;                      // emit an index record for next key
  uint32_t num_prefixes_;
  uint32_t num_keys_per_prefix_;

  uint32_t prev_key_prefix_hash_;
  uint32_t index_sparseness_;
  uint32_t index_size_;
  size_t sub_index_size_;

  const SliceTransform* prefix_extractor_;
  double hash_table_ratio_;
  double huge_page_tlb_size_;

  std::string prev_key_prefix_;         // last prefix seen by AddKeyPrefix

  static const size_t kRecordsPerGroup = 256;
};
||||
|
||||
}; // namespace rocksdb
|
Loading…
Reference in new issue