Summary: Currently, the in-memory hash index of the block-based table uses a precise hash map to track the prefix-to-block-range mapping. In some use cases, especially when the prefix itself is big, the memory overhead becomes a problem. This diff introduces a fixed hash bucket array that does not store the prefix and allows prefix collisions, similar to the PlainTable hash index, in order to reduce memory consumption. Just a quick draft, still testing and refining. Test Plan: unit test and shadow testing Reviewers: dhruba, kailiu, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19047 main
							parent
							
								
									3525aac9e5
								
							
						
					
					
						commit
						0f0076ed5a
					
				| @ -0,0 +1,242 @@ | ||||
| // Copyright (c) 2014, Facebook, Inc. All rights reserved.
 | ||||
| // This source code is licensed under the BSD-style license found in the
 | ||||
| // LICENSE file in the root directory of this source tree. An additional grant
 | ||||
| // of patent rights can be found in the PATENTS file in the same directory.
 | ||||
| 
 | ||||
| #include "table/block_prefix_index.h" | ||||
| 
 | ||||
| #include <vector> | ||||
| 
 | ||||
| #include "rocksdb/comparator.h" | ||||
| #include "rocksdb/slice.h" | ||||
| #include "rocksdb/slice_transform.h" | ||||
| #include "util/arena.h" | ||||
| #include "util/coding.h" | ||||
| #include "util/hash.h" | ||||
| 
 | ||||
| namespace rocksdb { | ||||
| 
 | ||||
| namespace { | ||||
| 
 | ||||
| inline uint32_t Hash(const Slice& s) { | ||||
|   return rocksdb::Hash(s.data(), s.size(), 0); | ||||
| } | ||||
| 
 | ||||
| inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) { | ||||
|   return Hash(prefix) % num_buckets; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
// Layout of the prefix block index
// --------------------------------
// The index is a flat array of 32-bit bucket entries. Every entry describes
// the blocks spanned by all prefixes that hash to that bucket:
//
//   * kNoneBlock (0x7FFFFFFF): the bucket is empty.
//   * high bit clear:          the entry itself is the id of the only block.
//   * high bit set:            the remaining 31 bits are an offset into a
//                              separate block array. At that offset the array
//                              holds a uint32_t count followed by that many
//                              block ids.
//
// A bucket ends up with more than one block either because several prefixes
// collide on it or because a single prefix spans multiple blocks. Storing the
// single-block case inline keeps the memory footprint small.

// Bucket entry reserved for "no block in this bucket".
const uint32_t kNoneBlock = 0x7FFFFFFF;
// High-order bit that tags an entry as an offset into the block array.
const uint32_t kBlockArrayMask = 0x80000000;

// True when the bucket entry is the reserved "empty" marker.
inline bool IsNone(uint32_t block_id) {
  return kNoneBlock == block_id;
}

// True when the entry's tag bit is clear, i.e. it is not a block array offset.
// Note that kNoneBlock also has the tag bit clear, so test IsNone() first.
inline bool IsBlockId(uint32_t block_id) {
  return !(block_id & kBlockArrayMask);
}

// Removes the tag bit from a bucket entry, yielding the block array offset.
// The entry must carry the tag bit.
inline uint32_t DecodeIndex(uint32_t block_id) {
  const uint32_t offset = kBlockArrayMask ^ block_id;
  assert(offset < kBlockArrayMask);
  return offset;
}

// Tags a block array offset so it can be stored as a bucket entry.
// The offset must fit in the low 31 bits.
inline uint32_t EncodeIndex(uint32_t index) {
  assert(index < kBlockArrayMask);
  return kBlockArrayMask | index;
}
| 
 | ||||
| 
 | ||||
| // temporary storage for prefix information during index building
 | ||||
| struct PrefixRecord { | ||||
|   Slice prefix; | ||||
|   uint32_t start_block; | ||||
|   uint32_t end_block; | ||||
|   uint32_t num_blocks; | ||||
|   PrefixRecord* next; | ||||
| }; | ||||
| 
 | ||||
| }  // anonymous namespace
 | ||||
| 
 | ||||
| class BlockPrefixIndex::Builder { | ||||
|  public: | ||||
|   explicit Builder(const SliceTransform* internal_prefix_extractor) | ||||
|       : internal_prefix_extractor_(internal_prefix_extractor) {} | ||||
| 
 | ||||
|   void Add(const Slice& key_prefix, uint32_t start_block, | ||||
|            uint32_t num_blocks) { | ||||
|     PrefixRecord* record = reinterpret_cast<PrefixRecord*>( | ||||
|       arena_.AllocateAligned(sizeof(PrefixRecord))); | ||||
|     record->prefix = key_prefix; | ||||
|     record->start_block = start_block; | ||||
|     record->end_block = start_block + num_blocks - 1; | ||||
|     record->num_blocks = num_blocks; | ||||
|     prefixes_.push_back(record); | ||||
|   } | ||||
| 
 | ||||
|   BlockPrefixIndex* Finish() { | ||||
|     // For now, use roughly 1:1 prefix to bucket ratio.
 | ||||
|     uint32_t num_buckets = prefixes_.size() + 1; | ||||
| 
 | ||||
|     // Collect prefix records that hash to the same bucket, into a single
 | ||||
|     // linklist.
 | ||||
|     std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr); | ||||
|     std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0); | ||||
|     for (PrefixRecord* current : prefixes_) { | ||||
|       uint32_t bucket = PrefixToBucket(current->prefix, num_buckets); | ||||
|       // merge the prefix block span if the first block of this prefix is
 | ||||
|       // connected to the last block of the previous prefix.
 | ||||
|       PrefixRecord* prev = prefixes_per_bucket[bucket]; | ||||
|       if (prev) { | ||||
|         assert(current->start_block >= prev->end_block); | ||||
|         auto distance = current->start_block - prev->end_block; | ||||
|         if (distance <= 1) { | ||||
|           prev->end_block = current->end_block; | ||||
|           prev->num_blocks = prev->end_block - prev->start_block + 1; | ||||
|           continue; | ||||
|         } | ||||
|       } | ||||
|       current->next = prev; | ||||
|       prefixes_per_bucket[bucket] = current; | ||||
|       num_blocks_per_bucket[bucket] += current->num_blocks; | ||||
|     } | ||||
| 
 | ||||
|     // Calculate the block array buffer size
 | ||||
|     uint32_t total_block_array_entries = 0; | ||||
|     for (uint32_t i = 0; i < num_buckets; i++) { | ||||
|       uint32_t num_blocks = num_blocks_per_bucket[i]; | ||||
|       if (num_blocks > 1) { | ||||
|         total_block_array_entries += (num_blocks + 1); | ||||
|       } | ||||
|     } | ||||
| 
 | ||||
|     // Populate the final prefix block index
 | ||||
|     uint32_t* block_array_buffer = new uint32_t[total_block_array_entries]; | ||||
|     uint32_t* buckets = new uint32_t[num_buckets]; | ||||
|     uint32_t offset = 0; | ||||
|     for (uint32_t i = 0; i < num_buckets; i++) { | ||||
|       uint32_t num_blocks = num_blocks_per_bucket[i]; | ||||
|       if (num_blocks == 0) { | ||||
|         assert(prefixes_per_bucket[i] == nullptr); | ||||
|         buckets[i] = kNoneBlock; | ||||
|       } else if (num_blocks == 1) { | ||||
|         assert(prefixes_per_bucket[i] != nullptr); | ||||
|         assert(prefixes_per_bucket[i]->next == nullptr); | ||||
|         buckets[i] = prefixes_per_bucket[i]->start_block; | ||||
|       } else { | ||||
|         assert(prefixes_per_bucket[i] != nullptr); | ||||
|         buckets[i] = EncodeIndex(offset); | ||||
|         block_array_buffer[offset] = num_blocks; | ||||
|         uint32_t* last_block = &block_array_buffer[offset + num_blocks]; | ||||
|         auto current = prefixes_per_bucket[i]; | ||||
|         // populate block ids from largest to smallest
 | ||||
|         while (current != nullptr) { | ||||
|           for (uint32_t i = 0; i < current->num_blocks; i++) { | ||||
|             *last_block = current->end_block - i; | ||||
|             last_block--; | ||||
|           } | ||||
|           current = current->next; | ||||
|         } | ||||
|         assert(last_block == &block_array_buffer[offset]); | ||||
|         offset += (num_blocks + 1); | ||||
|       } | ||||
|     } | ||||
| 
 | ||||
|     assert(offset == total_block_array_entries); | ||||
| 
 | ||||
|     return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, | ||||
|                                 buckets, total_block_array_entries, | ||||
|                                 block_array_buffer); | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   const SliceTransform* internal_prefix_extractor_; | ||||
| 
 | ||||
|   std::vector<PrefixRecord*> prefixes_; | ||||
|   Arena arena_; | ||||
| }; | ||||
| 
 | ||||
| 
 | ||||
| Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, | ||||
|                                 const Slice& prefixes, const Slice& prefix_meta, | ||||
|                                 BlockPrefixIndex** prefix_index) { | ||||
|   uint64_t pos = 0; | ||||
|   auto meta_pos = prefix_meta; | ||||
|   Status s; | ||||
|   Builder builder(internal_prefix_extractor); | ||||
| 
 | ||||
|   while (!meta_pos.empty()) { | ||||
|     uint32_t prefix_size = 0; | ||||
|     uint32_t entry_index = 0; | ||||
|     uint32_t num_blocks = 0; | ||||
|     if (!GetVarint32(&meta_pos, &prefix_size) || | ||||
|         !GetVarint32(&meta_pos, &entry_index) || | ||||
|         !GetVarint32(&meta_pos, &num_blocks)) { | ||||
|       s = Status::Corruption( | ||||
|           "Corrupted prefix meta block: unable to read from it."); | ||||
|       break; | ||||
|     } | ||||
|     if (pos + prefix_size > prefixes.size()) { | ||||
|       s = Status::Corruption( | ||||
|         "Corrupted prefix meta block: size inconsistency."); | ||||
|       break; | ||||
|     } | ||||
|     Slice prefix(prefixes.data() + pos, prefix_size); | ||||
|     builder.Add(prefix, entry_index, num_blocks); | ||||
| 
 | ||||
|     pos += prefix_size; | ||||
|   } | ||||
| 
 | ||||
|   if (s.ok() && pos != prefixes.size()) { | ||||
|     s = Status::Corruption("Corrupted prefix meta block"); | ||||
|   } | ||||
| 
 | ||||
|   if (s.ok()) { | ||||
|     *prefix_index = builder.Finish(); | ||||
|   } | ||||
| 
 | ||||
|   return s; | ||||
| } | ||||
| 
 | ||||
| const uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, | ||||
|                                            uint32_t** blocks) { | ||||
|   Slice prefix = internal_prefix_extractor_->Transform(key); | ||||
| 
 | ||||
|   uint32_t bucket = PrefixToBucket(prefix, num_buckets_); | ||||
|   uint32_t block_id = buckets_[bucket]; | ||||
| 
 | ||||
|   if (IsNone(block_id)) { | ||||
|     return 0; | ||||
|   } else if (IsBlockId(block_id)) { | ||||
|     *blocks = &buckets_[bucket]; | ||||
|     return 1; | ||||
|   } else { | ||||
|     uint32_t index = DecodeIndex(block_id); | ||||
|     assert(index < num_block_array_buffer_entries_); | ||||
|     *blocks = &block_array_buffer_[index+1]; | ||||
|     uint32_t num_blocks = block_array_buffer_[index]; | ||||
|     assert(num_blocks > 1); | ||||
|     assert(index + num_blocks < num_block_array_buffer_entries_); | ||||
|     return num_blocks; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| }  // namespace rocksdb
 | ||||
| @ -0,0 +1,67 @@ | ||||
| // Copyright (c) 2013, Facebook, Inc. All rights reserved.
 | ||||
| // This source code is licensed under the BSD-style license found in the
 | ||||
| // LICENSE file in the root directory of this source tree. An additional grant
 | ||||
| // of patent rights can be found in the PATENTS file in the same directory.
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include "rocksdb/status.h" | ||||
| 
 | ||||
| namespace rocksdb { | ||||
| 
 | ||||
| class Comparator; | ||||
| class Iterator; | ||||
| class Slice; | ||||
| class SliceTransform; | ||||
| 
 | ||||
| // Build a hash-based index to speed up the lookup for "index block".
 | ||||
| // BlockHashIndex accepts a key and, if found, returns its restart index within
 | ||||
| // that index block.
 | ||||
| class BlockPrefixIndex { | ||||
|  public: | ||||
| 
 | ||||
|   // Maps a key to a list of data blocks that could potentially contain
 | ||||
|   // the key, based on the prefix.
 | ||||
|   // Returns the total number of relevant blocks, 0 means the key does
 | ||||
|   // not exist.
 | ||||
|   const uint32_t GetBlocks(const Slice& key, uint32_t** blocks); | ||||
| 
 | ||||
|   size_t ApproximateMemoryUsage() const { | ||||
|     return sizeof(BlockPrefixIndex) + | ||||
|       (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t); | ||||
|   } | ||||
| 
 | ||||
|   // Create hash index by reading from the metadata blocks.
 | ||||
|   // @params prefixes: a sequence of prefixes.
 | ||||
|   // @params prefix_meta: contains the "metadata" to of the prefixes.
 | ||||
|   static Status Create(const SliceTransform* hash_key_extractor, | ||||
|                        const Slice& prefixes, const Slice& prefix_meta, | ||||
|                        BlockPrefixIndex** prefix_index); | ||||
| 
 | ||||
|   ~BlockPrefixIndex() { | ||||
|     delete[] buckets_; | ||||
|     delete[] block_array_buffer_; | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   class Builder; | ||||
|   friend Builder; | ||||
| 
 | ||||
|   BlockPrefixIndex(const SliceTransform* internal_prefix_extractor, | ||||
|                    uint32_t num_buckets, | ||||
|                    uint32_t* buckets, | ||||
|                    uint32_t num_block_array_buffer_entries, | ||||
|                    uint32_t* block_array_buffer) | ||||
|       : internal_prefix_extractor_(internal_prefix_extractor), | ||||
|         num_buckets_(num_buckets), | ||||
|         num_block_array_buffer_entries_(num_block_array_buffer_entries), | ||||
|         buckets_(buckets), | ||||
|         block_array_buffer_(block_array_buffer) {} | ||||
| 
 | ||||
|   const SliceTransform* internal_prefix_extractor_; | ||||
|   uint32_t num_buckets_; | ||||
|   uint32_t num_block_array_buffer_entries_; | ||||
|   uint32_t* buckets_; | ||||
|   uint32_t* block_array_buffer_; | ||||
| }; | ||||
| 
 | ||||
| }  // namespace rocksdb
 | ||||
					Loading…
					
					
				
		Reference in new issue
	
	 Haobo Xu
						Haobo Xu