fork of https://github.com/rust-rocksdb/rust-rocksdb for nextgraph
				
			
			
		
			You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							226 lines
						
					
					
						
							7.6 KiB
						
					
					
				
			
		
		
	
	
							226 lines
						
					
					
						
							7.6 KiB
						
					
					
				| // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
 | |
| //  This source code is licensed under both the GPLv2 (found in the
 | |
| //  COPYING file in the root directory) and Apache 2.0 License
 | |
| //  (found in the LICENSE.Apache file in the root directory).
 | |
| 
 | |
| #include "table/block_based/block_prefix_index.h"
 | |
| 
 | |
| #include <vector>
 | |
| 
 | |
| #include "memory/arena.h"
 | |
| #include "rocksdb/comparator.h"
 | |
| #include "rocksdb/slice.h"
 | |
| #include "rocksdb/slice_transform.h"
 | |
| #include "util/coding.h"
 | |
| #include "util/hash.h"
 | |
| 
 | |
| namespace ROCKSDB_NAMESPACE {
 | |
| 
 | |
| inline uint32_t Hash(const Slice& s) {
 | |
|   return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
 | |
| }
 | |
| 
 | |
| inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
 | |
|   return Hash(prefix) % num_buckets;
 | |
| }
 | |
| 
 | |
| // The prefix block index is simply a bucket array, with each entry pointing to
 | |
| // the blocks that span the prefixes hashed to this bucket.
 | |
| //
 | |
| // To reduce memory footprint, if there is only one block per bucket, the entry
 | |
| // stores the block id directly. If there are more than one blocks per bucket,
 | |
| // because of hash collision or a single prefix spanning multiple blocks,
 | |
| // the entry points to an array of block ids. The block array is an array of
 | |
| // uint32_t's. The first uint32_t indicates the total number of blocks, followed
 | |
| // by the block ids.
 | |
| //
 | |
| // To differentiate the two cases, the high order bit of the entry indicates
 | |
| // whether it is a 'pointer' into a separate block array.
 | |
| // 0x7FFFFFFF is reserved for empty bucket.
 | |
| 
 | |
| const uint32_t kNoneBlock = 0x7FFFFFFF;
 | |
| const uint32_t kBlockArrayMask = 0x80000000;
 | |
| 
 | |
| inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
 | |
| 
 | |
| inline bool IsBlockId(uint32_t block_id) {
 | |
|   return (block_id & kBlockArrayMask) == 0;
 | |
| }
 | |
| 
 | |
| inline uint32_t DecodeIndex(uint32_t block_id) {
 | |
|   uint32_t index = block_id ^ kBlockArrayMask;
 | |
|   assert(index < kBlockArrayMask);
 | |
|   return index;
 | |
| }
 | |
| 
 | |
| inline uint32_t EncodeIndex(uint32_t index) {
 | |
|   assert(index < kBlockArrayMask);
 | |
|   return index | kBlockArrayMask;
 | |
| }
 | |
| 
 | |
| // temporary storage for prefix information during index building
 | |
| struct PrefixRecord {
 | |
|   Slice prefix;
 | |
|   uint32_t start_block;
 | |
|   uint32_t end_block;
 | |
|   uint32_t num_blocks;
 | |
|   PrefixRecord* next;
 | |
| };
 | |
| 
 | |
| class BlockPrefixIndex::Builder {
 | |
|  public:
 | |
|   void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
 | |
|     PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
 | |
|         arena_.AllocateAligned(sizeof(PrefixRecord)));
 | |
|     record->prefix = key_prefix;
 | |
|     record->start_block = start_block;
 | |
|     record->end_block = start_block + num_blocks - 1;
 | |
|     record->num_blocks = num_blocks;
 | |
|     prefixes_.push_back(record);
 | |
|   }
 | |
| 
 | |
|   BlockPrefixIndex* Finish(const SliceTransform* prefix_extractor) {
 | |
|     // For now, use roughly 1:1 prefix to bucket ratio.
 | |
|     uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
 | |
| 
 | |
|     // Collect prefix records that hash to the same bucket, into a single
 | |
|     // linklist.
 | |
|     std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
 | |
|     std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
 | |
|     for (PrefixRecord* current : prefixes_) {
 | |
|       uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
 | |
|       // merge the prefix block span if the first block of this prefix is
 | |
|       // connected to the last block of the previous prefix.
 | |
|       PrefixRecord* prev = prefixes_per_bucket[bucket];
 | |
|       if (prev) {
 | |
|         assert(current->start_block >= prev->end_block);
 | |
|         auto distance = current->start_block - prev->end_block;
 | |
|         if (distance <= 1) {
 | |
|           prev->end_block = current->end_block;
 | |
|           prev->num_blocks = prev->end_block - prev->start_block + 1;
 | |
|           num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
 | |
|           continue;
 | |
|         }
 | |
|       }
 | |
|       current->next = prev;
 | |
|       prefixes_per_bucket[bucket] = current;
 | |
|       num_blocks_per_bucket[bucket] += current->num_blocks;
 | |
|     }
 | |
| 
 | |
|     // Calculate the block array buffer size
 | |
|     uint32_t total_block_array_entries = 0;
 | |
|     for (uint32_t i = 0; i < num_buckets; i++) {
 | |
|       uint32_t num_blocks = num_blocks_per_bucket[i];
 | |
|       if (num_blocks > 1) {
 | |
|         total_block_array_entries += (num_blocks + 1);
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     // Populate the final prefix block index
 | |
|     uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
 | |
|     uint32_t* buckets = new uint32_t[num_buckets];
 | |
|     uint32_t offset = 0;
 | |
|     for (uint32_t i = 0; i < num_buckets; i++) {
 | |
|       uint32_t num_blocks = num_blocks_per_bucket[i];
 | |
|       if (num_blocks == 0) {
 | |
|         assert(prefixes_per_bucket[i] == nullptr);
 | |
|         buckets[i] = kNoneBlock;
 | |
|       } else if (num_blocks == 1) {
 | |
|         assert(prefixes_per_bucket[i] != nullptr);
 | |
|         assert(prefixes_per_bucket[i]->next == nullptr);
 | |
|         buckets[i] = prefixes_per_bucket[i]->start_block;
 | |
|       } else {
 | |
|         assert(total_block_array_entries > 0);
 | |
|         assert(prefixes_per_bucket[i] != nullptr);
 | |
|         buckets[i] = EncodeIndex(offset);
 | |
|         block_array_buffer[offset] = num_blocks;
 | |
|         uint32_t* last_block = &block_array_buffer[offset + num_blocks];
 | |
|         auto current = prefixes_per_bucket[i];
 | |
|         // populate block ids from largest to smallest
 | |
|         while (current != nullptr) {
 | |
|           for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
 | |
|             *last_block = current->end_block - iter;
 | |
|             last_block--;
 | |
|           }
 | |
|           current = current->next;
 | |
|         }
 | |
|         assert(last_block == &block_array_buffer[offset]);
 | |
|         offset += (num_blocks + 1);
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     assert(offset == total_block_array_entries);
 | |
| 
 | |
|     return new BlockPrefixIndex(prefix_extractor, num_buckets, buckets,
 | |
|                                 total_block_array_entries, block_array_buffer);
 | |
|   }
 | |
| 
 | |
|  private:
 | |
|   std::vector<PrefixRecord*> prefixes_;
 | |
|   Arena arena_;
 | |
| };
 | |
| 
 | |
| Status BlockPrefixIndex::Create(const SliceTransform* prefix_extractor,
 | |
|                                 const Slice& prefixes, const Slice& prefix_meta,
 | |
|                                 BlockPrefixIndex** prefix_index) {
 | |
|   uint64_t pos = 0;
 | |
|   auto meta_pos = prefix_meta;
 | |
|   Status s;
 | |
|   Builder builder;
 | |
| 
 | |
|   while (!meta_pos.empty()) {
 | |
|     uint32_t prefix_size = 0;
 | |
|     uint32_t entry_index = 0;
 | |
|     uint32_t num_blocks = 0;
 | |
|     if (!GetVarint32(&meta_pos, &prefix_size) ||
 | |
|         !GetVarint32(&meta_pos, &entry_index) ||
 | |
|         !GetVarint32(&meta_pos, &num_blocks)) {
 | |
|       s = Status::Corruption(
 | |
|           "Corrupted prefix meta block: unable to read from it.");
 | |
|       break;
 | |
|     }
 | |
|     if (pos + prefix_size > prefixes.size()) {
 | |
|       s = Status::Corruption(
 | |
|           "Corrupted prefix meta block: size inconsistency.");
 | |
|       break;
 | |
|     }
 | |
|     Slice prefix(prefixes.data() + pos, prefix_size);
 | |
|     builder.Add(prefix, entry_index, num_blocks);
 | |
| 
 | |
|     pos += prefix_size;
 | |
|   }
 | |
| 
 | |
|   if (s.ok() && pos != prefixes.size()) {
 | |
|     s = Status::Corruption("Corrupted prefix meta block");
 | |
|   }
 | |
| 
 | |
|   if (s.ok()) {
 | |
|     *prefix_index = builder.Finish(prefix_extractor);
 | |
|   }
 | |
| 
 | |
|   return s;
 | |
| }
 | |
| 
 | |
| uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
 | |
|   Slice prefix = internal_prefix_extractor_.Transform(key);
 | |
| 
 | |
|   uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
 | |
|   uint32_t block_id = buckets_[bucket];
 | |
| 
 | |
|   if (IsNone(block_id)) {
 | |
|     return 0;
 | |
|   } else if (IsBlockId(block_id)) {
 | |
|     *blocks = &buckets_[bucket];
 | |
|     return 1;
 | |
|   } else {
 | |
|     uint32_t index = DecodeIndex(block_id);
 | |
|     assert(index < num_block_array_buffer_entries_);
 | |
|     *blocks = &block_array_buffer_[index + 1];
 | |
|     uint32_t num_blocks = block_array_buffer_[index];
 | |
|     assert(num_blocks > 1);
 | |
|     assert(index + num_blocks < num_block_array_buffer_entries_);
 | |
|     return num_blocks;
 | |
|   }
 | |
| }
 | |
| 
 | |
| }  // namespace ROCKSDB_NAMESPACE
 | |
| 
 |