Conflicts:
	HISTORY.md
	db/db_impl.cc
	db/db_impl.h
	db/db_iter.cc
	db/db_test.cc
	db/dbformat.h
	db/memtable.cc
	db/memtable_list.cc
	db/memtable_list.h
	db/table_cache.cc
	db/table_cache.h
	db/version_edit.h
	db/version_set.cc
	db/version_set.h
	db/write_batch.cc
	db/write_batch_test.cc
	include/rocksdb/options.h
	util/options.cc

commit 0143abdbb0
@@ -0,0 +1,337 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <algorithm>
#include <set>

#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "table/plain_table_factory.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"

using std::unique_ptr;

namespace rocksdb {

class PlainTableDBTest {
 protected:
 private:
  std::string dbname_;
  Env* env_;
  DB* db_;

  Options last_options_;
  static std::unique_ptr<const SliceTransform> prefix_transform;

 public:
  PlainTableDBTest() : env_(Env::Default()) {
    dbname_ = test::TmpDir() + "/plain_table_db_test";
    ASSERT_OK(DestroyDB(dbname_, Options()));
    db_ = nullptr;
    Reopen();
  }

  ~PlainTableDBTest() {
    delete db_;
    ASSERT_OK(DestroyDB(dbname_, Options()));
  }

  // Return the current option configuration.
  Options CurrentOptions() {
    Options options;
    options.table_factory.reset(new PlainTableFactory(16, 2, 0.8));
    options.prefix_extractor = prefix_transform.get();
    options.allow_mmap_reads = true;
    return options;
  }

  DBImpl* dbfull() {
    return reinterpret_cast<DBImpl*>(db_);
  }

  void Reopen(Options* options = nullptr) {
    ASSERT_OK(TryReopen(options));
  }

  void Close() {
    delete db_;
    db_ = nullptr;
  }

  void DestroyAndReopen(Options* options = nullptr) {
    // Destroy using last options
    Destroy(&last_options_);
    ASSERT_OK(TryReopen(options));
  }

  void Destroy(Options* options) {
    delete db_;
    db_ = nullptr;
    ASSERT_OK(DestroyDB(dbname_, *options));
  }

  Status PureReopen(Options* options, DB** db) {
    return DB::Open(*options, dbname_, db);
  }

  Status TryReopen(Options* options = nullptr) {
    delete db_;
    db_ = nullptr;
    Options opts;
    if (options != nullptr) {
      opts = *options;
    } else {
      opts = CurrentOptions();
      opts.create_if_missing = true;
    }
    last_options_ = opts;

    return DB::Open(opts, dbname_, &db_);
  }

  Status Put(const Slice& k, const Slice& v) {
    return db_->Put(WriteOptions(), k, v);
  }

  Status Delete(const std::string& k) {
    return db_->Delete(WriteOptions(), k);
  }

  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
    ReadOptions options;
    options.snapshot = snapshot;
    std::string result;
    Status s = db_->Get(options, k, &result);
    if (s.IsNotFound()) {
      result = "NOT_FOUND";
    } else if (!s.ok()) {
      result = s.ToString();
    }
    return result;
  }


  int NumTableFilesAtLevel(int level) {
    std::string property;
    ASSERT_TRUE(
        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
                         &property));
    return atoi(property.c_str());
  }

  // Return spread of files per level
  std::string FilesPerLevel() {
    std::string result;
    int last_non_zero_offset = 0;
    for (int level = 0; level < db_->NumberLevels(); level++) {
      int f = NumTableFilesAtLevel(level);
      char buf[100];
      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
      result += buf;
      if (f > 0) {
        last_non_zero_offset = result.size();
      }
    }
    result.resize(last_non_zero_offset);
    return result;
  }

  std::string IterStatus(Iterator* iter) {
    std::string result;
    if (iter->Valid()) {
      result = iter->key().ToString() + "->" + iter->value().ToString();
    } else {
      result = "(invalid)";
    }
    return result;
  }
};

std::unique_ptr<const SliceTransform> PlainTableDBTest::prefix_transform(
    NewFixedPrefixTransform(8));

TEST(PlainTableDBTest, Empty) {
  ASSERT_TRUE(dbfull() != nullptr);
  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}

TEST(PlainTableDBTest, ReadWrite) {
  ASSERT_OK(Put("1000000000000foo", "v1"));
  ASSERT_EQ("v1", Get("1000000000000foo"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("1000000000000foo", "v3"));
  ASSERT_EQ("v3", Get("1000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}

TEST(PlainTableDBTest, Flush) {
  ASSERT_OK(Put("1000000000000foo", "v1"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("1000000000000foo", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("1000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}

TEST(PlainTableDBTest, Iterator) {
  ASSERT_OK(Put("1000000000foo002", "v_2"));
  ASSERT_OK(Put("0000000000000bar", "random"));
  ASSERT_OK(Put("1000000000foo001", "v1"));
  ASSERT_OK(Put("3000000000000bar", "bar_v"));
  ASSERT_OK(Put("1000000000foo003", "v__3"));
  ASSERT_OK(Put("1000000000foo004", "v__4"));
  ASSERT_OK(Put("1000000000foo005", "v__5"));
  ASSERT_OK(Put("1000000000foo007", "v__7"));
  ASSERT_OK(Put("1000000000foo008", "v__8"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v1", Get("1000000000foo001"));
  ASSERT_EQ("v__3", Get("1000000000foo003"));
  ReadOptions ro;
  Iterator* iter = dbfull()->NewIterator(ro);
  iter->Seek("1000000000foo001");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo001", iter->key().ToString());
  ASSERT_EQ("v1", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo002", iter->key().ToString());
  ASSERT_EQ("v_2", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo003", iter->key().ToString());
  ASSERT_EQ("v__3", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo004", iter->key().ToString());
  ASSERT_EQ("v__4", iter->value().ToString());

  iter->Seek("3000000000000bar");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("3000000000000bar", iter->key().ToString());
  ASSERT_EQ("bar_v", iter->value().ToString());

  iter->Seek("1000000000foo000");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo001", iter->key().ToString());
  ASSERT_EQ("v1", iter->value().ToString());

  iter->Seek("1000000000foo005");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo005", iter->key().ToString());
  ASSERT_EQ("v__5", iter->value().ToString());

  iter->Seek("1000000000foo006");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo007", iter->key().ToString());
  ASSERT_EQ("v__7", iter->value().ToString());

  iter->Seek("1000000000foo008");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo008", iter->key().ToString());
  ASSERT_EQ("v__8", iter->value().ToString());

  iter->Seek("1000000000foo009");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("3000000000000bar", iter->key().ToString());


  delete iter;
}

TEST(PlainTableDBTest, Flush2) {
  ASSERT_OK(Put("0000000000000bar", "b"));
  ASSERT_OK(Put("1000000000000foo", "v1"));
  dbfull()->TEST_FlushMemTable();

  ASSERT_OK(Put("1000000000000foo", "v2"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v2", Get("1000000000000foo"));

  ASSERT_OK(Put("0000000000000eee", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000eee"));

  ASSERT_OK(Delete("0000000000000bar"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));

  ASSERT_OK(Put("0000000000000eee", "v5"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v5", Get("0000000000000eee"));
}

static std::string Key(int i) {
  char buf[100];
  snprintf(buf, sizeof(buf), "key_______%06d", i);
  return std::string(buf);
}

static std::string RandomString(Random* rnd, int len) {
  std::string r;
  test::RandomString(rnd, len, &r);
  return r;
}

TEST(PlainTableDBTest, CompactionTrigger) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100 << 10;  // 100KB
  options.num_levels = 3;
  options.max_mem_compaction_level = 0;
  options.level0_file_num_compaction_trigger = 3;
  Reopen(&options);

  Random rnd(301);

  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
       num++) {
    std::vector<std::string> values;
    // Write 120KB (12 values, each 10K)
    for (int i = 0; i < 12; i++) {
      values.push_back(RandomString(&rnd, 10000));
      ASSERT_OK(Put(Key(i), values[i]));
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
  }

  // generate one more file in level-0, and should trigger level-0 compaction
  std::vector<std::string> values;
  for (int i = 0; i < 12; i++) {
    values.push_back(RandomString(&rnd, 10000));
    ASSERT_OK(Put(Key(i), values[i]));
  }
  dbfull()->TEST_WaitForCompact();

  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}

}  // namespace rocksdb

int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
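
Aside, not part of the diff: the test's CurrentOptions() above is essentially all the wiring a caller needs to use the plain table format. A minimal sketch of that setup outside the test harness follows; the option fields and factory parameters are taken from the diff itself, while the function name OpenPlainTableDb and the error handling are illustrative only.

// Illustrative sketch only -- mirrors PlainTableDBTest::CurrentOptions() above.
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/slice_transform.h"
#include "table/plain_table_factory.h"

rocksdb::DB* OpenPlainTableDb(const std::string& path) {
  rocksdb::Options options;
  // Fixed 16-byte user keys, 2 bloom bits per key, 0.8 hash table ratio,
  // matching the constants the test uses.
  options.table_factory.reset(rocksdb::NewPlainTableFactory(16, 2, 0.8));
  // Plain table indexing is prefix-based, so a prefix extractor is required
  // (leaked here for brevity; the test keeps it in a static unique_ptr).
  options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
  options.allow_mmap_reads = true;  // PlainTableReader::Open asserts this.
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  return s.ok() ? db : nullptr;
}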
@@ -1,45 +0,0 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Arena class defines memory allocation methods. It's used by memtable and
// skiplist.

#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_

#include <limits>
#include <memory>

namespace rocksdb {

class Arena {
 public:
  Arena() {};
  virtual ~Arena() {};

  // Return a pointer to a newly allocated memory block of "bytes" bytes.
  virtual char* Allocate(size_t bytes) = 0;

  // Allocate memory with the normal alignment guarantees provided by malloc.
  virtual char* AllocateAligned(size_t bytes) = 0;

  // Returns an estimate of the total memory used by arena.
  virtual const size_t ApproximateMemoryUsage() = 0;

  // Returns the total number of bytes in all blocks allocated so far.
  virtual const size_t MemoryAllocatedBytes() = 0;

 private:
  // No copying allowed
  Arena(const Arena&);
  void operator=(const Arena&);
};

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_ARENA_H_
@@ -1,31 +0,0 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once
#include <memory>

namespace rocksdb {

class FlushBlockPolicyFactory;

struct BlockBasedTableOptions {
  // @flush_block_policy_factory creates the instances of flush block policy.
  // which provides a configurable way to determine when to flush a block in
  // the block based tables. If not set, table builder will use the default
  // block flush policy, which cut blocks by block size (please refer to
  // `FlushBlockBySizePolicy`).
  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;

  // TODO(kailiu) Temporarily disable this feature by making the default value
  // to be false. Also in master branch, this file is non-public so no user
  // will be able to change the value of `cache_index_and_filter_blocks`.
  //
  // Indicating if we'd put index/filter blocks to the block cache.
  // If not specified, each "table reader" object will pre-load index/filter
  // block during table initialization.
  bool cache_index_and_filter_blocks = false;
};

}  // namespace rocksdb
@@ -0,0 +1,286 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "table/meta_blocks.h"

#include <map>

#include "rocksdb/table.h"
#include "table/block.h"
#include "table/format.h"
#include "util/coding.h"

namespace rocksdb {

MetaIndexBuilder::MetaIndexBuilder()
    : meta_index_block_(
        new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}

void MetaIndexBuilder::Add(const std::string& key,
                           const BlockHandle& handle) {
  std::string handle_encoding;
  handle.EncodeTo(&handle_encoding);
  meta_block_handles_.insert({key, handle_encoding});
}

Slice MetaIndexBuilder::Finish() {
  for (const auto& metablock : meta_block_handles_) {
    meta_index_block_->Add(metablock.first, metablock.second);
  }
  return meta_index_block_->Finish();
}

PropertyBlockBuilder::PropertyBlockBuilder()
    : properties_block_(
        new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}

void PropertyBlockBuilder::Add(const std::string& name,
                               const std::string& val) {
  props_.insert({name, val});
}

void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
  assert(props_.find(name) == props_.end());

  std::string dst;
  PutVarint64(&dst, val);

  Add(name, dst);
}

void PropertyBlockBuilder::Add(
    const UserCollectedProperties& user_collected_properties) {
  for (const auto& prop : user_collected_properties) {
    Add(prop.first, prop.second);
  }
}

void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
  Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
  Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
  Add(TablePropertiesNames::kDataSize, props.data_size);
  Add(TablePropertiesNames::kIndexSize, props.index_size);
  Add(TablePropertiesNames::kNumEntries, props.num_entries);
  Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
  Add(TablePropertiesNames::kFilterSize, props.filter_size);
  Add(TablePropertiesNames::kFormatVersion, props.format_version);
  Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);

  if (!props.filter_policy_name.empty()) {
    Add(TablePropertiesNames::kFilterPolicy,
        props.filter_policy_name);
  }
}

Slice PropertyBlockBuilder::Finish() {
  for (const auto& prop : props_) {
    properties_block_->Add(prop.first, prop.second);
  }

  return properties_block_->Finish();
}

void LogPropertiesCollectionError(
    Logger* info_log, const std::string& method, const std::string& name) {
  assert(method == "Add" || method == "Finish");

  std::string msg =
    "[Warning] encountered error when calling TablePropertiesCollector::" +
    method + "() with collector name: " + name;
  Log(info_log, "%s", msg.c_str());
}

bool NotifyCollectTableCollectorsOnAdd(
    const Slice& key,
    const Slice& value,
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log) {
  bool all_succeeded = true;
  for (auto collector : collectors) {
    Status s = collector->Add(key, value);
    all_succeeded = all_succeeded && s.ok();
    if (!s.ok()) {
      LogPropertiesCollectionError(
          info_log, "Add" /* method */, collector->Name()
      );
    }
  }
  return all_succeeded;
}

bool NotifyCollectTableCollectorsOnFinish(
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log,
    PropertyBlockBuilder* builder) {
  bool all_succeeded = true;
  for (auto collector : collectors) {
    UserCollectedProperties user_collected_properties;
    Status s = collector->Finish(&user_collected_properties);

    all_succeeded = all_succeeded && s.ok();
    if (!s.ok()) {
      LogPropertiesCollectionError(
          info_log, "Finish" /* method */, collector->Name()
      );
    } else {
      builder->Add(user_collected_properties);
    }
  }

  return all_succeeded;
}

Status ReadProperties(
    const Slice& handle_value,
    RandomAccessFile* file,
    Env* env,
    Logger* logger,
    TableProperties* table_properties) {
  assert(table_properties);

  Slice v = handle_value;
  BlockHandle handle;
  if (!handle.DecodeFrom(&v).ok()) {
    return Status::InvalidArgument("Failed to decode properties block handle");
  }

  BlockContents block_contents;
  ReadOptions read_options;
  read_options.verify_checksums = false;
  Status s = ReadBlockContents(
      file,
      read_options,
      handle,
      &block_contents,
      env,
      false
  );

  if (!s.ok()) {
    return s;
  }

  Block properties_block(block_contents);
  std::unique_ptr<Iterator> iter(
      properties_block.NewIterator(BytewiseComparator())
  );

  // All pre-defined properties of type uint64_t
  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
    { TablePropertiesNames::kDataSize, &table_properties->data_size },
    { TablePropertiesNames::kIndexSize, &table_properties->index_size },
    { TablePropertiesNames::kFilterSize, &table_properties->filter_size },
    { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size },
    { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size },
    { TablePropertiesNames::kNumDataBlocks,
      &table_properties->num_data_blocks },
    { TablePropertiesNames::kNumEntries, &table_properties->num_entries },
    { TablePropertiesNames::kFormatVersion, &table_properties->format_version },
    { TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len },
  };

  std::string last_key;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    s = iter->status();
    if (!s.ok()) {
      break;
    }

    auto key = iter->key().ToString();
    // properties block is strictly sorted with no duplicate key.
    assert(
        last_key.empty() ||
        BytewiseComparator()->Compare(key, last_key) > 0
    );
    last_key = key;

    auto raw_val = iter->value();
    auto pos = predefined_uint64_properties.find(key);

    if (pos != predefined_uint64_properties.end()) {
      // handle predefined rocksdb properties
      uint64_t val;
      if (!GetVarint64(&raw_val, &val)) {
        // skip malformed value
        auto error_msg =
          "[Warning] detect malformed value in properties meta-block:"
          "\tkey: " + key + "\tval: " + raw_val.ToString();
        Log(logger, "%s", error_msg.c_str());
        continue;
      }
      *(pos->second) = val;
    } else if (key == TablePropertiesNames::kFilterPolicy) {
      table_properties->filter_policy_name = raw_val.ToString();
    } else {
      // handle user-collected properties
      table_properties->user_collected_properties.insert(
          std::make_pair(key, raw_val.ToString())
      );
    }
  }

  return s;
}

Status ReadTableProperties(
    RandomAccessFile* file,
    uint64_t file_size,
    uint64_t table_magic_number,
    Env* env,
    Logger* info_log,
    TableProperties* properties) {
  // -- Read metaindex block
  Footer footer(table_magic_number);
  auto s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }

  auto metaindex_handle = footer.metaindex_handle();
  BlockContents metaindex_contents;
  ReadOptions read_options;
  read_options.verify_checksums = false;
  s = ReadBlockContents(
      file,
      read_options,
      metaindex_handle,
      &metaindex_contents,
      env,
      false
  );
  if (!s.ok()) {
    return s;
  }
  Block metaindex_block(metaindex_contents);
  std::unique_ptr<Iterator> meta_iter(
      metaindex_block.NewIterator(BytewiseComparator())
  );

  // -- Read property block
  meta_iter->Seek(kPropertiesBlock);
  TableProperties table_properties;
  if (meta_iter->Valid() &&
      meta_iter->key() == kPropertiesBlock &&
      meta_iter->status().ok()) {
    s = ReadProperties(
        meta_iter->value(),
        file,
        env,
        info_log,
        properties
    );
  } else {
    s = Status::Corruption(
        "Unable to read the property block from the plain table"
    );
  }

  return s;
}


}  // namespace rocksdb
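
Aside, not part of the diff: the predefined properties above are stored as varint64-encoded values, so PropertyBlockBuilder::Add(name, uint64_t) on the write side and ReadProperties() on the read side must agree on that encoding. A small round-trip sketch using the same util/coding.h helpers; the wrapper function is mine.

#include <cassert>
#include <string>
#include "rocksdb/slice.h"
#include "util/coding.h"  // PutVarint64 / GetVarint64, as used above

// Sketch: encode a property value the way PropertyBlockBuilder::Add does,
// then decode it the way ReadProperties does for predefined uint64 fields.
void VarintPropertyRoundTrip() {
  const uint64_t num_entries = 12345;

  std::string encoded;
  rocksdb::PutVarint64(&encoded, num_entries);               // write side

  rocksdb::Slice raw_val(encoded);
  uint64_t decoded = 0;
  const bool ok = rocksdb::GetVarint64(&raw_val, &decoded);  // read side
  assert(ok && decoded == num_entries);
}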
@@ -0,0 +1,121 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once

#include <map>
#include <memory>
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/table_properties.h"
#include "table/block_builder.h"

namespace rocksdb {

class BlockBuilder;
class BlockHandle;
class Env;
class Logger;
class RandomAccessFile;
struct TableProperties;

// An STL style comparator that does the bytewise comparison internally.
struct BytewiseLessThan {
  bool operator()(const std::string& key1, const std::string& key2) const {
    // smaller entries will be placed in front.
    return comparator->Compare(key1, key2) <= 0;
  }

  const Comparator* comparator = BytewiseComparator();
};

// When writing to a block that requires entries to be sorted by
// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap`
// before writing to store.
typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;

class MetaIndexBuilder {
 public:
  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;

  MetaIndexBuilder();
  void Add(const std::string& key, const BlockHandle& handle);

  // Write all the added key/value pairs to the block and return the contents
  // of the block.
  Slice Finish();

 private:
  // store the sorted key/handle of the metablocks.
  BytewiseSortedMap meta_block_handles_;
  std::unique_ptr<BlockBuilder> meta_index_block_;
};

class PropertyBlockBuilder {
 public:
  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;

  PropertyBlockBuilder();

  void AddTableProperty(const TableProperties& props);
  void Add(const std::string& key, uint64_t value);
  void Add(const std::string& key, const std::string& value);
  void Add(const UserCollectedProperties& user_collected_properties);

  // Write all the added entries to the block and return the block contents
  Slice Finish();

 private:
  std::unique_ptr<BlockBuilder> properties_block_;
  BytewiseSortedMap props_;
};

// When we encounter any error during user-defined statistics collection,
// we'll write the warning message to the info log.
void LogPropertiesCollectionError(
    Logger* info_log, const std::string& method, const std::string& name);

// Utility functions that help the table builder to trigger batch events for
// user-defined property collectors.
// The return value indicates whether any error occurred; if an error occurred,
// the warning message will be logged.
// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
// property collectors.
bool NotifyCollectTableCollectorsOnAdd(
    const Slice& key,
    const Slice& value,
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log);

// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
// property collectors. The collected properties will be added to `builder`.
bool NotifyCollectTableCollectorsOnFinish(
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log,
    PropertyBlockBuilder* builder);

// Read the properties from the table.
Status ReadProperties(
    const Slice& handle_value,
    RandomAccessFile* file,
    Env* env,
    Logger* logger,
    TableProperties* table_properties);

// Directly read the properties from the properties block of a plain table.
Status ReadTableProperties(
    RandomAccessFile* file,
    uint64_t file_size,
    uint64_t table_magic_number,
    Env* env,
    Logger* info_log,
    TableProperties* properties);

}  // namespace rocksdb
@@ -0,0 +1,198 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_builder.h"

#include <assert.h>
#include <map>

#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "table/plain_table_factory.h"
#include "db/dbformat.h"
#include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"

namespace rocksdb {

namespace {

// a utility that helps writing block content to the file
// @offset will advance if @block_contents was successfully written.
// @block_handle the block handle of this particular block.
Status WriteBlock(
    const Slice& block_contents,
    WritableFile* file,
    uint64_t* offset,
    BlockHandle* block_handle) {
  block_handle->set_offset(*offset);
  block_handle->set_size(block_contents.size());
  Status s = file->Append(block_contents);

  if (s.ok()) {
    *offset += block_contents.size();
  }
  return s;
}

}  // namespace

// kPlainTableMagicNumber was picked by running
//    echo rocksdb.plain.table | sha1sum
// and taking the leading 64 bits.
extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;

PlainTableBuilder::PlainTableBuilder(const Options& options,
                                     WritableFile* file,
                                     uint32_t user_key_len) :
    options_(options), file_(file), user_key_len_(user_key_len) {
  properties_.fixed_key_len = user_key_len;

  // for plain table, we put all the data in a big chunk.
  properties_.num_data_blocks = 1;
  // emphasize that currently plain table doesn't have persistent index or
  // filter block.
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.format_version = 0;
}

PlainTableBuilder::~PlainTableBuilder() {
}

void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
  size_t user_key_size = key.size() - 8;
  assert(user_key_len_ == 0 || user_key_size == user_key_len_);

  if (!IsFixedLength()) {
    // Write key length
    key_size_str_.clear();
    PutVarint32(&key_size_str_, user_key_size);
    file_->Append(key_size_str_);
    offset_ += key_size_str_.length();
  }

  // Write key
  ParsedInternalKey parsed_key;
  if (!ParseInternalKey(key, &parsed_key)) {
    status_ = Status::Corruption(Slice());
    return;
  }
  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
    file_->Append(Slice(key.data(), user_key_size));
    char tmp_char = PlainTableFactory::kValueTypeSeqId0;
    file_->Append(Slice(&tmp_char, 1));
    offset_ += key.size() - 7;
  } else {
    file_->Append(key);
    offset_ += key.size();
  }

  // Write value length
  value_size_str_.clear();
  int value_size = value.size();
  PutVarint32(&value_size_str_, value_size);
  file_->Append(value_size_str_);

  // Write value
  file_->Append(value);
  offset_ += value_size + value_size_str_.length();

  properties_.num_entries++;
  properties_.raw_key_size += key.size();
  properties_.raw_value_size += value.size();

  // notify property collectors
  NotifyCollectTableCollectorsOnAdd(
      key,
      value,
      options_.table_properties_collectors,
      options_.info_log.get()
  );
}

Status PlainTableBuilder::status() const { return status_; }

Status PlainTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;

  properties_.data_size = offset_;

  // Write the following blocks
  //  1. [meta block: properties]
  //  2. [metaindex block]
  //  3. [footer]
  MetaIndexBuilder meta_index_builer;

  PropertyBlockBuilder property_block_builder;
  // -- Add basic properties
  property_block_builder.AddTableProperty(properties_);

  // -- Add user collected properties
  NotifyCollectTableCollectorsOnFinish(
      options_.table_properties_collectors,
      options_.info_log.get(),
      &property_block_builder
  );

  // -- Write property block
  BlockHandle property_block_handle;
  auto s = WriteBlock(
      property_block_builder.Finish(),
      file_,
      &offset_,
      &property_block_handle
  );
  if (!s.ok()) {
    return s;
  }
  meta_index_builer.Add(kPropertiesBlock, property_block_handle);

  // -- write metaindex block
  BlockHandle metaindex_block_handle;
  s = WriteBlock(
      meta_index_builer.Finish(),
      file_,
      &offset_,
      &metaindex_block_handle
  );
  if (!s.ok()) {
    return s;
  }

  // Write Footer
  Footer footer(kPlainTableMagicNumber);
  footer.set_metaindex_handle(metaindex_block_handle);
  footer.set_index_handle(BlockHandle::NullBlockHandle());
  std::string footer_encoding;
  footer.EncodeTo(&footer_encoding);
  s = file_->Append(footer_encoding);
  if (s.ok()) {
    offset_ += footer_encoding.size();
  }

  return s;
}

void PlainTableBuilder::Abandon() {
  closed_ = true;
}

uint64_t PlainTableBuilder::NumEntries() const {
  return properties_.num_entries;
}

uint64_t PlainTableBuilder::FileSize() const {
  return offset_;
}

}  // namespace rocksdb
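
Aside, not part of the diff: PlainTableBuilder::Add above lays each row out as [varint user-key length (variable-length keys only)][key][varint value length][value], where a key with sequence number 0 and type kTypeValue is shortened to the user key plus a single kValueTypeSeqId0 byte. A sketch of the uncompressed-key case in isolation; the EncodeRow helper and its buffer-based interface are mine, only PutVarint32 comes from the diff.

#include <string>
#include "rocksdb/slice.h"
#include "util/coding.h"  // PutVarint32, as used by the builder above

// Sketch of the row layout PlainTableBuilder::Add produces for a
// variable-length user key whose sequence/type suffix cannot be compressed.
std::string EncodeRow(const rocksdb::Slice& internal_key,
                      const rocksdb::Slice& value) {
  std::string row;
  const uint32_t user_key_size = internal_key.size() - 8;  // strip seq + type
  rocksdb::PutVarint32(&row, user_key_size);               // key length
  row.append(internal_key.data(), internal_key.size());    // full internal key
  rocksdb::PutVarint32(&row, value.size());                // value length
  row.append(value.data(), value.size());                  // value
  return row;
}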
@@ -0,0 +1,85 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built
// as production quality.

#pragma once
#include <stdint.h>
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "table/table_builder.h"
#include "rocksdb/table_properties.h"

namespace rocksdb {

class BlockBuilder;
class BlockHandle;
class WritableFile;
class TableBuilder;

class PlainTableBuilder: public TableBuilder {
 public:
  // Create a builder that will store the contents of the table it is
  // building in *file. Does not close the file. It is up to the
  // caller to close the file after calling Finish(). The output file
  // will be part of level specified by 'level'. A value of -1 means
  // that the caller does not know which level the output file will reside.
  PlainTableBuilder(const Options& options, WritableFile* file,
                    uint32_t user_key_size);

  // REQUIRES: Either Finish() or Abandon() has been called.
  ~PlainTableBuilder();

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  void Add(const Slice& key, const Slice& value) override;

  // Return non-ok iff some error has been detected.
  Status status() const override;

  // Finish building the table. Stops using the file passed to the
  // constructor after this function returns.
  // REQUIRES: Finish(), Abandon() have not been called
  Status Finish() override;

  // Indicate that the contents of this builder should be abandoned. Stops
  // using the file passed to the constructor after this function returns.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  void Abandon() override;

  // Number of calls to Add() so far.
  uint64_t NumEntries() const override;

  // Size of the file generated so far. If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const override;

 private:
  Options options_;
  WritableFile* file_;
  uint64_t offset_ = 0;
  Status status_;
  TableProperties properties_;

  const size_t user_key_len_;
  bool closed_ = false;  // Either Finish() or Abandon() has been called.

  std::string key_size_str_;
  std::string value_size_str_;

  bool IsFixedLength() const {
    return user_key_len_ > 0;
  }

  // No copying allowed
  PlainTableBuilder(const PlainTableBuilder&) = delete;
  void operator=(const PlainTableBuilder&) = delete;
};

}  // namespace rocksdb

@@ -0,0 +1,40 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_factory.h"

#include <memory>
#include <stdint.h>
#include "db/dbformat.h"
#include "table/plain_table_builder.h"
#include "table/plain_table_reader.h"
#include "port/port.h"

namespace rocksdb {

Status PlainTableFactory::NewTableReader(const Options& options,
                                         const EnvOptions& soptions,
                                         const InternalKeyComparator& icomp,
                                         unique_ptr<RandomAccessFile>&& file,
                                         uint64_t file_size,
                                         unique_ptr<TableReader>* table) const {
  return PlainTableReader::Open(options, soptions, icomp, std::move(file),
                                file_size, table, bloom_bits_per_key_,
                                hash_table_ratio_);
}

TableBuilder* PlainTableFactory::NewTableBuilder(
    const Options& options, const InternalKeyComparator& internal_comparator,
    WritableFile* file, CompressionType compression_type) const {
  return new PlainTableBuilder(options, file, user_key_len_);
}

extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
                                          int bloom_bits_per_key,
                                          double hash_table_ratio) {
  return new PlainTableFactory(user_key_len, bloom_bits_per_key,
                               hash_table_ratio);
}

}  // namespace rocksdb
@@ -0,0 +1,76 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <memory>
#include <stdint.h>

#include "rocksdb/options.h"
#include "rocksdb/table.h"

namespace rocksdb {

struct Options;
struct EnvOptions;

using std::unique_ptr;
class Status;
class RandomAccessFile;
class WritableFile;
class Table;
class TableBuilder;

// IndexedTable requires fixed length key, configured as a constructor
// parameter of the factory class. Output file format:
// +-------------+-----------------+
// | version     | user_key_length |
// +------------++------------------------------+  <= key1 offset
// | [key_size] | key1        | value_size |    |
// +------------+-------------+-------------+   |
// | value1                                     |
// |                                            |
// +----------------------------------------+---+  <= key2 offset
// | [key_size] | key2        | value_size |    |
// +------------+-------------+-------------+   |
// | value2                                     |
// |                                            |
// |        ......                              |
// +-----------------+--------------------------+
// If user_key_length = kPlainTableVariableLength, it means the key is variable
// length, there will be an extra field for key size encoded before every key.
class PlainTableFactory : public TableFactory {
 public:
  ~PlainTableFactory() {}
  // user_key_size is the length of the user key. If it is set to be
  // kPlainTableVariableLength, then it means variable length. Otherwise, all
  // the keys need to have the fixed length of this value. bloom_bits_per_key
  // is the number of bits used for the bloom filter per key. hash_table_ratio
  // is the desired utilization of the hash table used for prefix hashing.
  // hash_table_ratio = number of prefixes / #buckets in the hash table
  explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
                             int bloom_bits_per_key = 0,
                             double hash_table_ratio = 0.75)
      : user_key_len_(user_key_len),
        bloom_bits_per_key_(bloom_bits_per_key),
        hash_table_ratio_(hash_table_ratio) {}
  const char* Name() const override { return "PlainTable"; }
  Status NewTableReader(const Options& options, const EnvOptions& soptions,
                        const InternalKeyComparator& internal_comparator,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table) const override;
  TableBuilder* NewTableBuilder(const Options& options,
                                const InternalKeyComparator& icomparator,
                                WritableFile* file,
                                CompressionType compression_type) const
      override;

  static const char kValueTypeSeqId0 = 0xFF;

 private:
  uint32_t user_key_len_;
  int bloom_bits_per_key_;
  double hash_table_ratio_;
};

}  // namespace rocksdb
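
Aside, not part of the diff: the hash_table_ratio documented above is applied by PlainTableReader::AllocateIndexAndBloom in the next hunk; the bucket count is roughly num_prefixes / hash_table_ratio, with ratios above 1.0 clamped. Restated as a standalone helper (the function wrapper is mine):

#include <cstdint>

// Sketch of the bucket-count rule implied by hash_table_ratio above and by
// PlainTableReader::AllocateIndexAndBloom below.
inline uint32_t PlainTableBucketCount(int num_prefixes,
                                      double hash_table_ratio) {
  const double multiplier =
      (hash_table_ratio > 1.0) ? 1.0 : 1.0 / hash_table_ratio;
  return static_cast<uint32_t>(num_prefixes * multiplier) + 1;
}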
@ -0,0 +1,695 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "table/plain_table_reader.h" |
||||
|
||||
#include <string> |
||||
|
||||
#include "db/dbformat.h" |
||||
|
||||
#include "rocksdb/cache.h" |
||||
#include "rocksdb/comparator.h" |
||||
#include "rocksdb/env.h" |
||||
#include "rocksdb/filter_policy.h" |
||||
#include "rocksdb/options.h" |
||||
#include "rocksdb/statistics.h" |
||||
|
||||
#include "table/block.h" |
||||
#include "table/filter_block.h" |
||||
#include "table/format.h" |
||||
#include "table/meta_blocks.h" |
||||
#include "table/two_level_iterator.h" |
||||
#include "table/plain_table_factory.h" |
||||
|
||||
#include "util/coding.h" |
||||
#include "util/dynamic_bloom.h" |
||||
#include "util/hash.h" |
||||
#include "util/histogram.h" |
||||
#include "util/murmurhash.h" |
||||
#include "util/perf_context_imp.h" |
||||
#include "util/stop_watch.h" |
||||
|
||||
|
||||
namespace rocksdb { |
||||
|
||||
namespace { |
||||
|
||||
inline uint32_t GetSliceHash(Slice const& s) { |
||||
return Hash(s.data(), s.size(), 397) ; |
||||
} |
||||
|
||||
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { |
||||
return hash % num_buckets; |
||||
} |
||||
|
||||
} // namespace
|
||||
|
||||
// Iterator to iterate IndexedTable
|
||||
class PlainTableIterator : public Iterator { |
||||
public: |
||||
explicit PlainTableIterator(PlainTableReader* table); |
||||
~PlainTableIterator(); |
||||
|
||||
bool Valid() const; |
||||
|
||||
void SeekToFirst(); |
||||
|
||||
void SeekToLast(); |
||||
|
||||
void Seek(const Slice& target); |
||||
|
||||
void Next(); |
||||
|
||||
void Prev(); |
||||
|
||||
Slice key() const; |
||||
|
||||
Slice value() const; |
||||
|
||||
Status status() const; |
||||
|
||||
private: |
||||
PlainTableReader* table_; |
||||
uint32_t offset_; |
||||
uint32_t next_offset_; |
||||
Slice key_; |
||||
Slice value_; |
||||
Status status_; |
||||
std::string tmp_str_; |
||||
// No copying allowed
|
||||
PlainTableIterator(const PlainTableIterator&) = delete; |
||||
void operator=(const Iterator&) = delete; |
||||
}; |
||||
|
||||
extern const uint64_t kPlainTableMagicNumber; |
||||
PlainTableReader::PlainTableReader(const EnvOptions& storage_options, |
||||
const InternalKeyComparator& icomparator, |
||||
uint64_t file_size, int bloom_bits_per_key, |
||||
double hash_table_ratio, |
||||
const TableProperties& table_properties) |
||||
: soptions_(storage_options), |
||||
internal_comparator_(icomparator), |
||||
file_size_(file_size), |
||||
kHashTableRatio(hash_table_ratio), |
||||
kBloomBitsPerKey(bloom_bits_per_key), |
||||
table_properties_(table_properties), |
||||
data_end_offset_(table_properties_.data_size), |
||||
user_key_len_(table_properties.fixed_key_len) {} |
||||
|
||||
PlainTableReader::~PlainTableReader() { |
||||
delete[] hash_table_; |
||||
delete[] sub_index_; |
||||
delete bloom_; |
||||
} |
||||
|
||||
Status PlainTableReader::Open(const Options& options, |
||||
const EnvOptions& soptions, |
||||
const InternalKeyComparator& internal_comparator, |
||||
unique_ptr<RandomAccessFile>&& file, |
||||
uint64_t file_size, |
||||
unique_ptr<TableReader>* table_reader, |
||||
const int bloom_bits_per_key, |
||||
double hash_table_ratio) { |
||||
assert(options.allow_mmap_reads); |
||||
|
||||
if (file_size > kMaxFileSize) { |
||||
return Status::NotSupported("File is too large for PlainTableReader!"); |
||||
} |
||||
|
||||
TableProperties table_properties; |
||||
auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, |
||||
options.env, options.info_log.get(), |
||||
&table_properties); |
||||
if (!s.ok()) { |
||||
return s; |
||||
} |
||||
|
||||
std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( |
||||
soptions, internal_comparator, file_size, bloom_bits_per_key, |
||||
hash_table_ratio, table_properties)); |
||||
new_reader->file_ = std::move(file); |
||||
new_reader->options_ = options; |
||||
|
||||
// -- Populate Index
|
||||
s = new_reader->PopulateIndex(); |
||||
if (!s.ok()) { |
||||
return s; |
||||
} |
||||
|
||||
*table_reader = std::move(new_reader); |
||||
return s; |
||||
} |
||||
|
||||
void PlainTableReader::SetupForCompaction() { |
||||
} |
||||
|
||||
bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) { |
||||
return true; |
||||
} |
||||
|
||||
Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { |
||||
return new PlainTableIterator(this); |
||||
} |
||||
|
||||
struct PlainTableReader::IndexRecord { |
||||
uint32_t hash; // hash of the prefix
|
||||
uint32_t offset; // offset of a row
|
||||
IndexRecord* next; |
||||
}; |
||||
|
||||
// Helper class to track all the index records
|
||||
class PlainTableReader::IndexRecordList { |
||||
public: |
||||
explicit IndexRecordList(size_t num_records_per_group) |
||||
: kNumRecordsPerGroup(num_records_per_group), |
||||
current_group_(nullptr), |
||||
num_records_in_current_group_(num_records_per_group) {} |
||||
|
||||
~IndexRecordList() { |
||||
for (size_t i = 0; i < groups_.size(); i++) { |
||||
delete[] groups_[i]; |
||||
} |
||||
} |
||||
|
||||
void AddRecord(murmur_t hash, uint32_t offset) { |
||||
if (num_records_in_current_group_ == kNumRecordsPerGroup) { |
||||
current_group_ = AllocateNewGroup(); |
||||
num_records_in_current_group_ = 0; |
||||
} |
||||
auto& new_record = current_group_[num_records_in_current_group_++]; |
||||
new_record.hash = hash; |
||||
new_record.offset = offset; |
||||
new_record.next = nullptr; |
||||
} |
||||
|
||||
size_t GetNumRecords() const { |
||||
return (groups_.size() - 1) * kNumRecordsPerGroup + |
||||
num_records_in_current_group_; |
||||
} |
||||
IndexRecord* At(size_t index) { |
||||
return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); |
||||
} |
||||
|
||||
private: |
||||
IndexRecord* AllocateNewGroup() { |
||||
IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; |
||||
groups_.push_back(result); |
||||
return result; |
||||
} |
||||
|
||||
const size_t kNumRecordsPerGroup; |
||||
IndexRecord* current_group_; |
||||
// List of arrays allocated
|
||||
std::vector<IndexRecord*> groups_; |
||||
size_t num_records_in_current_group_; |
||||
}; |
||||
|
||||
int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) { |
||||
Slice prev_key_prefix_slice; |
||||
uint32_t prev_key_prefix_hash = 0; |
||||
uint32_t pos = data_start_offset_; |
||||
int key_index_within_prefix = 0; |
||||
bool is_first_record = true; |
||||
HistogramImpl keys_per_prefix_hist; |
||||
// Need map to be ordered to make sure sub indexes generated
|
||||
// are in order.
|
||||
|
||||
int num_prefixes = 0; |
||||
while (pos < data_end_offset_) { |
||||
uint32_t key_offset = pos; |
||||
ParsedInternalKey key; |
||||
Slice value_slice; |
||||
status_ = Next(pos, &key, &value_slice, pos); |
||||
Slice key_prefix_slice = GetPrefix(key); |
||||
|
||||
if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { |
||||
++num_prefixes; |
||||
if (!is_first_record) { |
||||
keys_per_prefix_hist.Add(key_index_within_prefix); |
||||
} |
||||
key_index_within_prefix = 0; |
||||
prev_key_prefix_slice = key_prefix_slice; |
||||
prev_key_prefix_hash = GetSliceHash(key_prefix_slice); |
||||
} |
||||
|
||||
if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { |
||||
// Add an index key for every kIndexIntervalForSamePrefixKeys keys
|
||||
record_list->AddRecord(prev_key_prefix_hash, key_offset); |
||||
} |
||||
is_first_record = false; |
||||
} |
||||
|
||||
keys_per_prefix_hist.Add(key_index_within_prefix); |
||||
Log(options_.info_log, "Number of Keys per prefix Histogram: %s", |
||||
keys_per_prefix_hist.ToString().c_str()); |
||||
|
||||
return num_prefixes; |
||||
} |
||||
|
||||
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { |
||||
delete[] hash_table_; |
||||
|
||||
if (kBloomBitsPerKey > 0) { |
||||
bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey); |
||||
} |
||||
double hash_table_size_multipier = |
||||
(kHashTableRatio > 1.0) ? 1.0 : 1.0 / kHashTableRatio; |
||||
hash_table_size_ = num_prefixes * hash_table_size_multipier + 1; |
||||
hash_table_ = new uint32_t[hash_table_size_]; |
||||
} |
||||
|
||||
size_t PlainTableReader::BucketizeIndexesAndFillBloom( |
||||
IndexRecordList& record_list, int num_prefixes, |
||||
std::vector<IndexRecord*>* hash_to_offsets, |
||||
std::vector<uint32_t>* bucket_count) { |
||||
size_t sub_index_size_needed = 0; |
||||
bool first = true; |
||||
uint32_t prev_hash = 0; |
||||
size_t num_records = record_list.GetNumRecords(); |
||||
for (size_t i = 0; i < num_records; i++) { |
||||
IndexRecord* index_record = record_list.At(i); |
||||
uint32_t cur_hash = index_record->hash; |
||||
if (first || prev_hash != cur_hash) { |
||||
prev_hash = cur_hash; |
||||
first = false; |
||||
if (bloom_) { |
||||
bloom_->AddHash(cur_hash); |
||||
} |
||||
} |
||||
uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_); |
||||
IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; |
||||
index_record->next = prev_bucket_head; |
||||
(*hash_to_offsets)[bucket] = index_record; |
||||
auto& item_count = (*bucket_count)[bucket]; |
||||
if (item_count > 0) { |
||||
if (item_count == 1) { |
||||
sub_index_size_needed += kOffsetLen + 1; |
||||
} |
||||
if (item_count == 127) { |
||||
// Need more than one byte for length
|
||||
sub_index_size_needed++; |
||||
} |
||||
sub_index_size_needed += kOffsetLen; |
||||
} |
||||
item_count++; |
||||
} |
||||
return sub_index_size_needed; |
||||
} |
||||
|
||||
void PlainTableReader::FillIndexes( |
||||
size_t sub_index_size_needed, |
||||
const std::vector<IndexRecord*>& hash_to_offsets, |
||||
const std::vector<uint32_t>& bucket_count) { |
||||
Log(options_.info_log, "Reserving %zu bytes for sub index", |
||||
sub_index_size_needed); |
||||
// 8 bytes buffer for variable length size
|
||||
size_t buffer_size = 8 * 8; |
||||
size_t buffer_used = 0; |
||||
sub_index_size_needed += buffer_size; |
||||
sub_index_ = new char[sub_index_size_needed]; |
||||
size_t sub_index_offset = 0; |
||||
char* prev_ptr; |
||||
char* cur_ptr; |
||||
uint32_t* sub_index_ptr; |
||||
for (int i = 0; i < hash_table_size_; i++) { |
||||
uint32_t num_keys_for_bucket = bucket_count[i]; |
||||
switch (num_keys_for_bucket) { |
||||
case 0: |
||||
// No key for bucket
|
||||
hash_table_[i] = data_end_offset_; |
||||
break; |
||||
case 1: |
||||
// point directly to the file offset
|
||||
hash_table_[i] = hash_to_offsets[i]->offset; |
||||
break; |
||||
default: |
||||
// point to second level indexes.
|
||||
hash_table_[i] = sub_index_offset | kSubIndexMask; |
||||
prev_ptr = sub_index_ + sub_index_offset; |
||||
cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); |
||||
sub_index_offset += (cur_ptr - prev_ptr); |
||||
if (cur_ptr - prev_ptr > 2 |
||||
|| (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) { |
||||
// Need to resize sub_index. Exponentially grow buffer.
|
||||
buffer_used += cur_ptr - prev_ptr - 1; |
||||
if (buffer_used + 4 > buffer_size) { |
||||
Log(options_.info_log, "Recalculate suffix_map length to %zu", |
||||
sub_index_size_needed); |
||||
|
||||
sub_index_size_needed += buffer_size; |
||||
buffer_size *= 2; |
||||
char* new_sub_index = new char[sub_index_size_needed]; |
||||
memcpy(new_sub_index, sub_index_, sub_index_offset); |
||||
delete[] sub_index_; |
||||
sub_index_ = new_sub_index; |
||||
} |
||||
} |
||||
sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset); |
||||
IndexRecord* record = hash_to_offsets[i]; |
||||
int j; |
||||
for (j = num_keys_for_bucket - 1; j >= 0 && record; |
||||
j--, record = record->next) { |
||||
sub_index_ptr[j] = record->offset; |
||||
} |
||||
assert(j == -1 && record == nullptr); |
||||
sub_index_offset += kOffsetLen * num_keys_for_bucket; |
||||
break; |
||||
} |
||||
} |
||||
|
||||
Log(options_.info_log, "hash table size: %d, suffix_map length %zu", |
||||
hash_table_size_, sub_index_size_needed); |
||||
} |
||||
|
||||
Status PlainTableReader::PopulateIndex() { |
||||
// Get mmapped memory to file_data_.
|
||||
Status s = file_->Read(0, file_size_, &file_data_, nullptr); |
||||
if (!s.ok()) { |
||||
return s; |
||||
} |
||||
|
||||
IndexRecordList record_list(kRecordsPerGroup); |
||||
// First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
|
||||
// for a prefix (starting from the first one), generate a record of (hash,
|
||||
// offset) and append it to IndexRecordList, which is a data structure created
|
||||
// to store them.
|
||||
int num_prefixes = PopulateIndexRecordList(&record_list); |
||||
// Calculated hash table and bloom filter size and allocate memory for indexes
|
||||
// and bloom filter based on the number of prefixes.
|
||||
AllocateIndexAndBloom(num_prefixes); |
||||
|
||||
// Bucketize all the index records to a temp data structure, in which for
|
||||
// each bucket, we generate a linked list of IndexRecord, in reversed order.
|
||||
std::vector<IndexRecord*> hash_to_offsets(hash_table_size_, nullptr); |
||||
std::vector<uint32_t> bucket_count(hash_table_size_, 0); |
||||
size_t sub_index_size_needed = BucketizeIndexesAndFillBloom( |
||||
record_list, num_prefixes, &hash_to_offsets, &bucket_count); |
||||
// From the temp data structure, populate indexes.
|
||||
FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count); |
||||
|
||||
return Status::OK(); |
||||
} |
||||
|
||||
Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, |
||||
uint32_t prefix_hash, bool& prefix_matched, |
||||
uint32_t& ret_offset) { |
||||
prefix_matched = false; |
||||
int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_); |
||||
uint32_t bucket_value = hash_table_[bucket]; |
||||
if (bucket_value == data_end_offset_) { |
||||
ret_offset = data_end_offset_; |
||||
return Status::OK(); |
||||
} else if ((bucket_value & kSubIndexMask) == 0) { |
||||
// point directly to the file
|
||||
ret_offset = bucket_value; |
||||
return Status::OK(); |
||||
} |
||||
|
||||
// point to sub-index, need to do a binary search
|
||||
uint32_t low = 0; |
||||
uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask; |
||||
|
||||
const char* index_ptr = sub_index_ + prefix_index_offset; |
||||
uint32_t upper_bound = 0; |
||||
const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr, |
||||
index_ptr + 4, |
||||
&upper_bound); |
||||
uint32_t high = upper_bound; |
||||
ParsedInternalKey mid_key; |
||||
ParsedInternalKey parsed_target; |
||||
if (!ParseInternalKey(target, &parsed_target)) { |
||||
return Status::Corruption(Slice()); |
||||
} |
||||
|
||||
// The key is in the range [low, high). Do a binary search within it.
||||
while (high - low > 1) { |
||||
uint32_t mid = (high + low) / 2; |
||||
uint32_t file_offset = base_ptr[mid]; |
||||
size_t tmp; |
||||
Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp); |
||||
if (!s.ok()) { |
||||
return s; |
||||
} |
||||
int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); |
||||
if (cmp_result < 0) { |
||||
low = mid; |
||||
} else { |
||||
if (cmp_result == 0) { |
||||
// We happen to have found the exact key, or target is smaller than the
// first key after base_offset.
||||
prefix_matched = true; |
||||
ret_offset = file_offset; |
||||
return Status::OK(); |
||||
} else { |
||||
high = mid; |
||||
} |
||||
} |
||||
} |
||||
// Either the key at position low or the one at low+1 could share the same
// prefix as target. We need to rule out one of them to avoid going to the
// wrong prefix.
||||
ParsedInternalKey low_key; |
||||
size_t tmp; |
||||
uint32_t low_key_offset = base_ptr[low]; |
||||
Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp);
if (!s.ok()) {
  return s;
}
||||
if (GetPrefix(low_key) == prefix) { |
||||
prefix_matched = true; |
||||
ret_offset = low_key_offset; |
||||
} else if (low + 1 < upper_bound) { |
||||
// There is possibly a next prefix; return it.
||||
prefix_matched = false; |
||||
ret_offset = base_ptr[low + 1]; |
||||
} else { |
||||
// target is larger than the last key in this bucket, and that key has a
// different prefix. The key does not exist.
||||
ret_offset = data_end_offset_; |
||||
} |
||||
return Status::OK(); |
||||
} |
||||
|
||||
bool PlainTableReader::MayHavePrefix(uint32_t hash) { |
||||
return bloom_ == nullptr || bloom_->MayContainHash(hash); |
||||
} |
||||
|
||||
Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) { |
||||
return options_.prefix_extractor->Transform(target.user_key); |
||||
} |
||||
|
||||
Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key, |
||||
size_t& bytes_read) { |
||||
const char* key_ptr = nullptr; |
||||
bytes_read = 0; |
||||
size_t user_key_size = 0; |
||||
if (IsFixedLength()) { |
||||
user_key_size = user_key_len_; |
||||
key_ptr = row_ptr; |
||||
} else { |
||||
uint32_t tmp_size = 0; |
||||
key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_, |
||||
&tmp_size); |
||||
if (key_ptr == nullptr) { |
||||
return Status::Corruption("Unable to read the next key"); |
||||
} |
||||
user_key_size = (size_t)tmp_size; |
||||
bytes_read = key_ptr - row_ptr; |
||||
} |
||||
if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) { |
||||
return Status::Corruption("Unable to read the next key"); |
||||
} |
||||
|
||||
if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) { |
||||
// Special encoding for the row with seqID=0
|
||||
key->user_key = Slice(key_ptr, user_key_size); |
||||
key->sequence = 0; |
||||
key->type = kTypeValue; |
||||
bytes_read += user_key_size + 1; |
||||
} else { |
||||
if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) { |
||||
return Status::Corruption("Unable to read the next key"); |
||||
} |
||||
if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) { |
||||
return Status::Corruption(Slice()); |
||||
} |
||||
bytes_read += user_key_size + 8; |
||||
} |
||||
|
||||
return Status::OK(); |
||||
} |
||||
|
||||
Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key, |
||||
Slice* value, uint32_t& next_offset) { |
||||
if (offset == data_end_offset_) { |
||||
next_offset = data_end_offset_; |
||||
return Status::OK(); |
||||
} |
||||
|
||||
if (offset > data_end_offset_) { |
||||
return Status::Corruption("Offset is out of file size"); |
||||
} |
||||
|
||||
const char* row_ptr = file_data_.data() + offset; |
||||
size_t bytes_for_key; |
||||
Status s = ReadKey(row_ptr, key, bytes_for_key);
if (!s.ok()) {
  return s;
}
||||
uint32_t value_size; |
||||
const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key, |
||||
file_data_.data() + data_end_offset_, |
||||
&value_size); |
||||
if (value_ptr == nullptr) { |
||||
return Status::Corruption("Error reading value length."); |
||||
} |
||||
next_offset = offset + (value_ptr - row_ptr) + value_size; |
||||
if (next_offset > data_end_offset_) { |
||||
return Status::Corruption("Reach end of file when reading value"); |
||||
} |
||||
*value = Slice(value_ptr, value_size); |
||||
|
||||
return Status::OK(); |
||||
} |
||||
|
||||
Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, |
||||
void* arg, |
||||
bool (*saver)(void*, const ParsedInternalKey&, |
||||
const Slice&, bool), |
||||
void (*mark_key_may_exist)(void*)) { |
||||
// Check bloom filter first.
|
||||
Slice prefix_slice = GetPrefix(target); |
||||
uint32_t prefix_hash = GetSliceHash(prefix_slice); |
||||
if (!MayHavePrefix(prefix_hash)) { |
||||
return Status::OK(); |
||||
} |
||||
uint32_t offset; |
||||
bool prefix_match; |
||||
Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset); |
||||
if (!s.ok()) { |
||||
return s; |
||||
} |
||||
ParsedInternalKey found_key; |
||||
ParsedInternalKey parsed_target; |
||||
if (!ParseInternalKey(target, &parsed_target)) { |
||||
return Status::Corruption(Slice()); |
||||
} |
||||
|
||||
Slice found_value; |
||||
while (offset < data_end_offset_) { |
||||
Status s = Next(offset, &found_key, &found_value, offset); |
||||
if (!s.ok()) { |
||||
return s; |
||||
} |
||||
if (!prefix_match) { |
||||
// Need to verify prefix for the first key found if it is not yet
|
||||
// checked.
|
||||
if (GetPrefix(found_key) != prefix_slice) { |
||||
return Status::OK(); |
||||
} |
||||
prefix_match = true; |
||||
} |
||||
if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { |
||||
if (!(*saver)(arg, found_key, found_value, true)) { |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
return Status::OK(); |
||||
} |
||||
|
||||
uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { |
||||
return 0; |
||||
} |
||||
|
||||
PlainTableIterator::PlainTableIterator(PlainTableReader* table) : |
||||
table_(table) { |
||||
next_offset_ = offset_ = table_->data_end_offset_; |
||||
} |
||||
|
||||
PlainTableIterator::~PlainTableIterator() { |
||||
} |
||||
|
||||
bool PlainTableIterator::Valid() const { |
||||
return offset_ < table_->data_end_offset_ |
||||
&& offset_ >= table_->data_start_offset_; |
||||
} |
||||
|
||||
void PlainTableIterator::SeekToFirst() { |
||||
next_offset_ = table_->data_start_offset_; |
||||
if (next_offset_ >= table_->data_end_offset_) { |
||||
next_offset_ = offset_ = table_->data_end_offset_; |
||||
} else { |
||||
Next(); |
||||
} |
||||
} |
||||
|
||||
void PlainTableIterator::SeekToLast() { |
||||
assert(false); |
||||
} |
||||
|
||||
void PlainTableIterator::Seek(const Slice& target) { |
||||
Slice prefix_slice = table_->GetPrefix(target); |
||||
uint32_t prefix_hash = GetSliceHash(prefix_slice); |
||||
if (!table_->MayHavePrefix(prefix_hash)) { |
||||
offset_ = next_offset_ = table_->data_end_offset_; |
||||
return; |
||||
} |
||||
bool prefix_match; |
||||
status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, |
||||
next_offset_); |
||||
if (!status_.ok()) { |
||||
offset_ = next_offset_ = table_->data_end_offset_; |
||||
return; |
||||
} |
||||
|
||||
if (next_offset_ < table_->data_end_offset_) { |
||||
for (Next(); status_.ok() && Valid(); Next()) { |
||||
if (!prefix_match) { |
||||
// Need to verify the first key's prefix
|
||||
if (table_->GetPrefix(key()) != prefix_slice) { |
||||
offset_ = next_offset_ = table_->data_end_offset_; |
||||
break; |
||||
} |
||||
prefix_match = true; |
||||
} |
||||
if (table_->internal_comparator_.Compare(key(), target) >= 0) { |
||||
break; |
||||
} |
||||
} |
||||
} else { |
||||
offset_ = table_->data_end_offset_; |
||||
} |
||||
} |
||||
|
||||
void PlainTableIterator::Next() { |
||||
offset_ = next_offset_; |
||||
if (offset_ < table_->data_end_offset_) { |
||||
Slice tmp_slice; |
||||
ParsedInternalKey parsed_key; |
||||
status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_); |
||||
if (status_.ok()) { |
||||
// Make a copy in this case. TODO optimize.
|
||||
tmp_str_.clear(); |
||||
AppendInternalKey(&tmp_str_, parsed_key); |
||||
key_ = Slice(tmp_str_); |
||||
} else { |
||||
offset_ = next_offset_ = table_->data_end_offset_; |
||||
} |
||||
} |
||||
} |
||||
|
||||
void PlainTableIterator::Prev() { |
||||
assert(false); |
||||
} |
||||
|
||||
Slice PlainTableIterator::key() const { |
||||
assert(Valid()); |
||||
return key_; |
||||
} |
||||
|
||||
Slice PlainTableIterator::value() const { |
||||
assert(Valid()); |
||||
return value_; |
||||
} |
||||
|
||||
Status PlainTableIterator::status() const { |
||||
return status_; |
||||
} |
||||
|
||||
} // namespace rocksdb
|
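Taken together, ReadKey() and Next() above walk rows laid out as [varint32 user key length][user key][either a one-byte seqID=0 marker or an 8-byte internal-key footer][varint32 value length][value]. Below is a standalone sketch of that decoding; every name in it (DecodeRow, Row, kSeqIdZeroMarker and its 0xFF value) is an assumption for illustration, not part of these sources.

#include <cstdint>
#include <cstring>
#include <string>

namespace {

// Assumed marker byte for the compact "sequence number == 0" encoding.
const unsigned char kSeqIdZeroMarker = 0xFF;

// Minimal varint32 decoder using the same wire format as the reader.
const char* DecodeVarint32(const char* p, const char* limit, uint32_t* value) {
  uint32_t result = 0;
  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
    uint32_t byte = static_cast<unsigned char>(*p++);
    result |= (byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *value = result;
      return p;
    }
  }
  return nullptr;  // truncated or corrupted varint
}

struct Row {
  std::string user_key;
  uint64_t sequence = 0;
  std::string value;
};

// Decodes one variable-length row starting at `offset` and returns the offset
// of the next row, or 0 on corruption.
size_t DecodeRow(const char* data, size_t offset, size_t end, Row* row) {
  const char* p = data + offset;
  const char* limit = data + end;
  uint32_t key_size = 0;
  p = DecodeVarint32(p, limit, &key_size);
  if (p == nullptr || p + key_size + 1 > limit) return 0;
  row->user_key.assign(p, key_size);
  p += key_size;
  if (static_cast<unsigned char>(*p) == kSeqIdZeroMarker) {
    row->sequence = 0;  // compact encoding: one marker byte, sequence is 0
    p += 1;
  } else {
    if (p + 8 > limit) return 0;
    uint64_t footer;    // 8-byte internal-key footer: (sequence << 8) | type
    std::memcpy(&footer, p, sizeof(footer));  // assumes little-endian storage
    row->sequence = footer >> 8;
    p += 8;
  }
  uint32_t value_size = 0;
  p = DecodeVarint32(p, limit, &value_size);
  if (p == nullptr || p + value_size > limit) return 0;
  row->value.assign(p, value_size);
  return static_cast<size_t>((p + value_size) - data);  // next row's offset
}

}  // namespace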
@ -0,0 +1,220 @@ |
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once |
||||
#include <unordered_map> |
||||
#include <memory> |
||||
#include <vector> |
||||
#include <string> |
||||
#include <stdint.h> |
||||
|
||||
#include "db/dbformat.h" |
||||
#include "rocksdb/env.h" |
||||
#include "rocksdb/iterator.h" |
||||
#include "rocksdb/slice_transform.h" |
||||
#include "rocksdb/table.h" |
||||
#include "rocksdb/table_properties.h" |
||||
#include "table/table_reader.h" |
||||
#include "table/plain_table_factory.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class Block; |
||||
class BlockHandle; |
||||
class Footer; |
||||
struct Options; |
||||
class RandomAccessFile; |
||||
struct ReadOptions; |
||||
class TableCache; |
||||
class TableReader; |
||||
class DynamicBloom; |
||||
class InternalKeyComparator; |
||||
|
||||
using std::unique_ptr; |
||||
using std::unordered_map; |
||||
extern const uint32_t kPlainTableVariableLength; |
||||
|
||||
// Based on the output file format shown in plain_table_factory.h.
// When opening the file, PlainTableReader creates a hash table from key
// prefixes to offsets in the file. For each bucket it decides whether the
// entry points directly to the data offset of the first key with that
// prefix, or, if too many keys share the prefix, to a binary-searchable
// index from the key suffix to the file offset.
//
// The implementation of PlainTableReader requires the output file to be
// mmapped.
||||
class PlainTableReader: public TableReader { |
||||
public: |
||||
static Status Open(const Options& options, const EnvOptions& soptions, |
||||
const InternalKeyComparator& internal_comparator, |
||||
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, |
||||
unique_ptr<TableReader>* table, |
||||
const int bloom_bits_per_key, double hash_table_ratio); |
||||
|
||||
bool PrefixMayMatch(const Slice& internal_prefix); |
||||
|
||||
Iterator* NewIterator(const ReadOptions&); |
||||
|
||||
Status Get(const ReadOptions&, const Slice& key, void* arg, |
||||
bool (*result_handler)(void* arg, const ParsedInternalKey& k, |
||||
const Slice& v, bool), |
||||
void (*mark_key_may_exist)(void*) = nullptr); |
||||
|
||||
uint64_t ApproximateOffsetOf(const Slice& key); |
||||
|
||||
void SetupForCompaction(); |
||||
|
||||
const TableProperties& GetTableProperties() { return table_properties_; } |
||||
|
||||
PlainTableReader(const EnvOptions& storage_options, |
||||
const InternalKeyComparator& internal_comparator, |
||||
uint64_t file_size, int bloom_num_bits, |
||||
double hash_table_ratio, |
||||
const TableProperties& table_properties); |
||||
~PlainTableReader(); |
||||
|
||||
private: |
||||
struct IndexRecord; |
||||
class IndexRecordList; |
||||
|
||||
uint32_t* hash_table_ = nullptr; |
||||
int hash_table_size_ = 0; |
||||
char* sub_index_ = nullptr; |
||||
|
||||
Options options_; |
||||
const EnvOptions& soptions_; |
||||
const InternalKeyComparator internal_comparator_; |
||||
Status status_; |
||||
unique_ptr<RandomAccessFile> file_; |
||||
|
||||
Slice file_data_; |
||||
uint32_t version_; |
||||
uint32_t file_size_; |
||||
|
||||
const double kHashTableRatio; |
||||
const int kBloomBitsPerKey; |
||||
DynamicBloom* bloom_ = nullptr; |
||||
|
||||
TableProperties table_properties_; |
||||
const uint32_t data_start_offset_ = 0; |
||||
const uint32_t data_end_offset_; |
||||
const size_t user_key_len_; |
||||
|
||||
static const size_t kNumInternalBytes = 8; |
||||
static const uint32_t kSubIndexMask = 0x80000000; |
||||
static const size_t kOffsetLen = sizeof(uint32_t); |
||||
static const uint64_t kMaxFileSize = 1u << 31; |
||||
static const size_t kRecordsPerGroup = 256; |
||||
// To speed up the search for keys with the same prefix, we add an index key
// for every N keys, where N is determined by kIndexIntervalForSamePrefixKeys.
||||
static const size_t kIndexIntervalForSamePrefixKeys = 16; |
||||
|
||||
bool IsFixedLength() const { |
||||
return user_key_len_ != kPlainTableVariableLength; |
||||
} |
||||
|
||||
size_t GetFixedInternalKeyLength() const { |
||||
return user_key_len_ + kNumInternalBytes; |
||||
} |
||||
|
||||
friend class TableCache; |
||||
friend class PlainTableIterator; |
||||
|
||||
// Internal helper function to generate an IndexRecordList object from all
|
||||
// the rows, which contains index records as a list.
|
||||
int PopulateIndexRecordList(IndexRecordList* record_list); |
||||
|
||||
// Internal helper function to allocate memory for indexes and bloom filters
|
||||
void AllocateIndexAndBloom(int num_prefixes); |
||||
|
||||
// Internal helper function to bucket the index record list into hash buckets.
// hash_to_offsets has hash_table_size_ entries; each entry is a linked list
// of offsets for that hash, in reverse order.
// bucket_count has hash_table_size_ entries; each value is the number of
// index records in hash_to_offsets for that bucket.
||||
size_t BucketizeIndexesAndFillBloom( |
||||
IndexRecordList& record_list, int num_prefixes, |
||||
std::vector<IndexRecord*>* hash_to_offsets, |
||||
std::vector<uint32_t>* bucket_count); |
||||
|
||||
// Internal helper function to fill the indexes and bloom filters into the
// internal data structures. hash_to_offsets and bucket_count are the
// bucketized indexes and counts generated by BucketizeIndexesAndFillBloom().
||||
void FillIndexes(size_t sub_index_size_needed, |
||||
const std::vector<IndexRecord*>& hash_to_offsets, |
||||
const std::vector<uint32_t>& bucket_count); |
||||
|
||||
// PopulateIndex() builds index of keys. It must be called before any query
|
||||
// to the table.
|
||||
//
|
||||
// hash_table_ contains hash_table_size_ buckets, each a 32-bit integer. The
// lower 31 bits contain an offset value (explained below) and the first bit
// of the integer indicates the type of the offset.
||||
//
|
||||
// +--------------+------------------------------------------------------+
|
||||
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
||||
// +--------------+------------------------------------------------------+
|
||||
//
|
||||
// Explanation for the "flag bit":
|
||||
//
|
||||
// 0 indicates that the bucket contains only one prefix (no conflict when
// hashing this prefix), whose first row starts at this offset of the file.
// 1 indicates that the bucket contains more than one prefix, or that there
// are too many rows for one prefix so we need a binary search for it. In
// this case, the offset is an offset into sub_index_, which holds the
// binary search indexes of keys for those rows. Those binary search indexes
// are organized in this way:
||||
//
|
||||
// A varint32 first indicates how many records (N) are stored after it. Then
// there are N 32-bit integers, each holding a file offset that points to the
// start of a row. These offsets are guaranteed to be in ascending order, so
// the keys they point to are also in ascending order, which lets us binary
// search over them. Below is a visual presentation of a bucket.
||||
//
|
||||
// <begin>
|
||||
// number_of_records: varint32
|
||||
// record 1 file offset: fixedint32
|
||||
// record 2 file offset: fixedint32
|
||||
// ....
|
||||
// record N file offset: fixedint32
|
||||
// <end>
|
||||
Status PopulateIndex(); |
||||
|
||||
// Check bloom filter to see whether it might contain this prefix.
|
||||
// The hash of the prefix is given, since it can be reused for index lookup
|
||||
// too.
|
||||
bool MayHavePrefix(uint32_t hash); |
||||
|
||||
Status ReadKey(const char* row_ptr, ParsedInternalKey* key, |
||||
size_t& bytes_read); |
||||
// Read the key and value at offset into key and value.
// next_offset is returned as the offset of the next key.
||||
Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value, |
||||
uint32_t& next_offset); |
||||
// Get file offset for key target.
|
||||
// return value prefix_matched is set to true if the offset is confirmed
|
||||
// for a key with the same prefix as target.
|
||||
Status GetOffset(const Slice& target, const Slice& prefix, |
||||
uint32_t prefix_hash, bool& prefix_matched, |
||||
uint32_t& ret_offset); |
||||
|
||||
Slice GetPrefix(const Slice& target) { |
||||
assert(target.size() >= 8); // target is internal key
|
||||
return options_.prefix_extractor->Transform( |
||||
Slice(target.data(), target.size() - 8)); |
||||
} |
||||
|
||||
Slice GetPrefix(const ParsedInternalKey& target); |
||||
|
||||
// No copying allowed
|
||||
explicit PlainTableReader(const TableReader&) = delete; |
||||
void operator=(const TableReader&) = delete; |
||||
}; |
||||
} // namespace rocksdb
|
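To make the flag-bit layout documented in the PopulateIndex() comment above concrete, the tiny helpers below show one way the bucket entries could be packed and unpacked; the function names are hypothetical, and kSubIndexMask mirrors the constant declared in the class.

#include <cstdint>

// Sketch of the bucket-entry encoding: the top bit says whether the low
// 31 bits are a file offset (flag 0) or an offset into sub_index_ (flag 1).
const uint32_t kSubIndexMask = 0x80000000;

inline uint32_t EncodeDirectOffset(uint32_t file_offset) {
  return file_offset;                       // flag bit 0: points into the file
}

inline uint32_t EncodeSubIndexOffset(uint32_t sub_index_offset) {
  return sub_index_offset | kSubIndexMask;  // flag bit 1: points into sub_index_
}

inline bool PointsToSubIndex(uint32_t bucket_value) {
  return (bucket_value & kSubIndexMask) != 0;
}

inline uint32_t DecodeOffset(uint32_t bucket_value) {
  return bucket_value & ~kSubIndexMask;     // strip the flag bit
}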
@ -0,0 +1,55 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class Slice; |
||||
class Status; |
||||
|
||||
// TableBuilder provides the interface used to build a Table
|
||||
// (an immutable and sorted map from keys to values).
|
||||
//
|
||||
// Multiple threads can invoke const methods on a TableBuilder without
|
||||
// external synchronization, but if any of the threads may call a
|
||||
// non-const method, all threads accessing the same TableBuilder must use
|
||||
// external synchronization.
|
||||
class TableBuilder { |
||||
public: |
||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||
virtual ~TableBuilder() {} |
||||
|
||||
// Add key,value to the table being constructed.
|
||||
// REQUIRES: key is after any previously added key according to comparator.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
virtual void Add(const Slice& key, const Slice& value) = 0; |
||||
|
||||
// Return non-ok iff some error has been detected.
|
||||
virtual Status status() const = 0; |
||||
|
||||
// Finish building the table.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
virtual Status Finish() = 0; |
||||
|
||||
// Indicate that the contents of this builder should be abandoned.
|
||||
// If the caller is not going to call Finish(), it must call Abandon()
|
||||
// before destroying this builder.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
virtual void Abandon() = 0; |
||||
|
||||
// Number of calls to Add() so far.
|
||||
virtual uint64_t NumEntries() const = 0; |
||||
|
||||
// Size of the file generated so far. If invoked after a successful
|
||||
// Finish() call, returns the size of the final generated file.
|
||||
virtual uint64_t FileSize() const = 0; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
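The interface above implies a straightforward lifecycle: Add() keys in sorted order, check status(), then either Finish() or Abandon(). Here is a hedged sketch of driving a concrete builder; BuildFromSorted and its arguments are illustrative, the include path for this header is assumed, and any TableBuilder subclass would do.

#include <string>
#include <utility>
#include <vector>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "table/table_builder.h"   // path assumed for the header above

rocksdb::Status BuildFromSorted(
    rocksdb::TableBuilder* builder,
    const std::vector<std::pair<std::string, std::string>>& sorted_kvs) {
  for (const auto& kv : sorted_kvs) {
    builder->Add(kv.first, kv.second);  // keys must arrive in sorted order
    if (!builder->status().ok()) {
      builder->Abandon();               // required if Finish() will be skipped
      return builder->status();
    }
  }
  return builder->Finish();             // after this, FileSize() is final
}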
@ -0,0 +1,114 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "rocksdb/table_properties.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
namespace { |
||||
void AppendProperty( |
||||
std::string& props, |
||||
const std::string& key, |
||||
const std::string& value, |
||||
const std::string& prop_delim, |
||||
const std::string& kv_delim) { |
||||
props.append(key); |
||||
props.append(kv_delim); |
||||
props.append(value); |
||||
props.append(prop_delim); |
||||
} |
||||
|
||||
template <class TValue> |
||||
void AppendProperty( |
||||
std::string& props, |
||||
const std::string& key, |
||||
const TValue& value, |
||||
const std::string& prop_delim, |
||||
const std::string& kv_delim) { |
||||
AppendProperty( |
||||
props, key, std::to_string(value), prop_delim, kv_delim |
||||
); |
||||
} |
||||
} |
||||
|
||||
std::string TableProperties::ToString( |
||||
const std::string& prop_delim, |
||||
const std::string& kv_delim) const { |
||||
std::string result; |
||||
result.reserve(1024); |
||||
|
||||
// Basic Info
|
||||
AppendProperty( |
||||
result, "# data blocks", num_data_blocks, prop_delim, kv_delim |
||||
); |
||||
AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); |
||||
|
||||
AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); |
||||
AppendProperty( |
||||
result, |
||||
"raw average key size", |
||||
num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, |
||||
prop_delim, |
||||
kv_delim |
||||
); |
||||
AppendProperty( |
||||
result, "raw value size", raw_value_size, prop_delim, kv_delim |
||||
); |
||||
AppendProperty( |
||||
result, |
||||
"raw average value size", |
||||
num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, |
||||
prop_delim, |
||||
kv_delim |
||||
); |
||||
|
||||
AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); |
||||
AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); |
||||
AppendProperty( |
||||
result, "filter block size", filter_size, prop_delim, kv_delim |
||||
); |
||||
AppendProperty( |
||||
result, |
||||
"(estimated) table size", |
||||
data_size + index_size + filter_size, |
||||
prop_delim, |
||||
kv_delim |
||||
); |
||||
|
||||
AppendProperty( |
||||
result, |
||||
"filter policy name", |
||||
filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, |
||||
prop_delim, |
||||
kv_delim |
||||
); |
||||
|
||||
return result; |
||||
} |
||||
|
||||
const std::string TablePropertiesNames::kDataSize = |
||||
"rocksdb.data.size"; |
||||
const std::string TablePropertiesNames::kIndexSize = |
||||
"rocksdb.index.size"; |
||||
const std::string TablePropertiesNames::kFilterSize = |
||||
"rocksdb.filter.size"; |
||||
const std::string TablePropertiesNames::kRawKeySize = |
||||
"rocksdb.raw.key.size"; |
||||
const std::string TablePropertiesNames::kRawValueSize = |
||||
"rocksdb.raw.value.size"; |
||||
const std::string TablePropertiesNames::kNumDataBlocks = |
||||
"rocksdb.num.data.blocks"; |
||||
const std::string TablePropertiesNames::kNumEntries = |
||||
"rocksdb.num.entries"; |
||||
const std::string TablePropertiesNames::kFilterPolicy = |
||||
"rocksdb.filter.policy"; |
||||
const std::string TablePropertiesNames::kFormatVersion = |
||||
"rocksdb.format.version"; |
||||
const std::string TablePropertiesNames::kFixedKeyLen = |
||||
"rocksdb.fixed.key.length"; |
||||
|
||||
extern const std::string kPropertiesBlock = "rocksdb.properties"; |
||||
|
||||
} // namespace rocksdb
|
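For reference, the ToString() defined above takes the property and key/value delimiters from the caller, so the output can be shaped freely; a small hedged example (DumpProperties is illustrative):

#include <cstdio>
#include <string>

#include "rocksdb/table_properties.h"

void DumpProperties(const rocksdb::TableProperties& props) {
  // One property per line, rendered as "name: value".
  std::string text = props.ToString("\n", ": ");
  std::printf("%s\n", text.c_str());
}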
@ -0,0 +1,71 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class Iterator; |
||||
struct ParsedInternalKey; |
||||
class Slice; |
||||
struct ReadOptions; |
||||
struct TableProperties; |
||||
|
||||
// A Table is a sorted map from strings to strings. Tables are
|
||||
// immutable and persistent. A Table may be safely accessed from
|
||||
// multiple threads without external synchronization.
|
||||
class TableReader { |
||||
public: |
||||
virtual ~TableReader() {} |
||||
|
||||
// Determine whether there is a chance that the current table file
// contains a key starting with internal_prefix. The specific
// table implementation can use a bloom filter and/or other heuristics
// to filter out this table as a whole.
||||
virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; |
||||
|
||||
// Returns a new iterator over the table contents.
|
||||
// The result of NewIterator() is initially invalid (caller must
|
||||
// call one of the Seek methods on the iterator before using it).
|
||||
virtual Iterator* NewIterator(const ReadOptions&) = 0; |
||||
|
||||
// Given a key, return an approximate byte offset in the file where
|
||||
// the data for that key begins (or would begin if the key were
|
||||
// present in the file). The returned value is in terms of file
|
||||
// bytes, and so includes effects like compression of the underlying data.
|
||||
// E.g., the approximate offset of the last key in the table will
|
||||
// be close to the file length.
|
||||
virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; |
||||
|
||||
// Set up the table for Compaction. Might change some parameters with
|
||||
// posix_fadvise
|
||||
virtual void SetupForCompaction() = 0; |
||||
|
||||
virtual const TableProperties& GetTableProperties() = 0; |
||||
|
||||
// Calls (*result_handler)(handle_context, ...) repeatedly, starting with
// the entry found after a call to Seek(key), until result_handler returns
// false, where k is the actual internal key for a row found and v is the
// value of the key. didIO is true if I/O is involved in the operation. May
// not make such a call if the filter policy says that the key is not present.
||||
//
|
||||
// mark_key_may_exist_handler needs to be called when the read is configured
// to be memory-only and the key is not found in the block cache, passing
// handle_context as its parameter.
||||
//
|
||||
// readOptions is the options for the read
|
||||
// key is the key to search for
|
||||
virtual Status Get( |
||||
const ReadOptions& readOptions, const Slice& key, void* handle_context, |
||||
bool (*result_handler)(void* arg, const ParsedInternalKey& k, |
||||
const Slice& v, bool didIO), |
||||
void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
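To make the Get() contract above concrete, here is a sketch of a saver callback with the expected result_handler signature; SaverContext and the matching logic are assumptions for illustration.

#include <string>

#include "db/dbformat.h"      // ParsedInternalKey, kTypeValue
#include "rocksdb/slice.h"

namespace {

struct SaverContext {
  std::string user_key;  // the user key being looked up
  std::string value;     // filled in when a live value is found
  bool found = false;
};

// Matches the result_handler signature in TableReader::Get(); returning
// false stops the scan.
bool SaveValue(void* arg, const rocksdb::ParsedInternalKey& k,
               const rocksdb::Slice& v, bool /*didIO*/) {
  auto* ctx = static_cast<SaverContext*>(arg);
  if (k.user_key == rocksdb::Slice(ctx->user_key)) {
    if (k.type == rocksdb::kTypeValue) {
      ctx->value.assign(v.data(), v.size());
      ctx->found = true;
    }
    return false;  // first entry for this user key decides the result
  }
  return true;     // different user key: keep scanning
}

}  // namespace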
File diff suppressed because it is too large
@ -0,0 +1,36 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "dynamic_bloom.h" |
||||
|
||||
#include "rocksdb/slice.h" |
||||
#include "util/hash.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
namespace { |
||||
static uint32_t BloomHash(const Slice& key) { |
||||
return Hash(key.data(), key.size(), 0xbc9f1d34); |
||||
} |
||||
} |
||||
|
||||
DynamicBloom::DynamicBloom(uint32_t total_bits, |
||||
uint32_t (*hash_func)(const Slice& key), |
||||
uint32_t num_probes) |
||||
: hash_func_(hash_func), |
||||
kTotalBits((total_bits + 7) / 8 * 8), |
||||
kNumProbes(num_probes) { |
||||
assert(hash_func_); |
||||
assert(kNumProbes > 0); |
||||
assert(kTotalBits > 0); |
||||
data_.reset(new unsigned char[kTotalBits / 8]()); |
||||
} |
||||
|
||||
DynamicBloom::DynamicBloom(uint32_t total_bits, |
||||
uint32_t num_probes) |
||||
: DynamicBloom(total_bits, &BloomHash, num_probes) { |
||||
} |
||||
|
||||
} // rocksdb
|
@ -0,0 +1,72 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once |
||||
|
||||
#include <atomic> |
||||
#include <memory> |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class Slice; |
||||
|
||||
class DynamicBloom { |
||||
public: |
||||
// total_bits: fixed total bits for the bloom
|
||||
// hash_func: customized hash function
|
||||
// num_probes: number of hash probes for a single key
|
||||
DynamicBloom(uint32_t total_bits, |
||||
uint32_t (*hash_func)(const Slice& key), |
||||
uint32_t num_probes = 6); |
||||
|
||||
explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6); |
||||
|
||||
// Assuming single threaded access to this function.
|
||||
void Add(const Slice& key); |
||||
|
||||
// Assuming single threaded access to this function.
|
||||
void AddHash(uint32_t hash); |
||||
|
||||
// Multithreaded access to this function is OK
|
||||
bool MayContain(const Slice& key); |
||||
|
||||
// Multithreaded access to this function is OK
|
||||
bool MayContainHash(uint32_t hash); |
||||
|
||||
private: |
||||
uint32_t (*hash_func_)(const Slice& key); |
||||
const uint32_t kTotalBits; |
||||
const uint32_t kNumProbes; |
||||
std::unique_ptr<unsigned char[]> data_; |
||||
}; |
||||
|
||||
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } |
||||
|
||||
inline bool DynamicBloom::MayContain(const Slice& key) { |
||||
return (MayContainHash(hash_func_(key))); |
||||
} |
||||
|
||||
inline bool DynamicBloom::MayContainHash(uint32_t h) { |
||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||
for (uint32_t i = 0; i < kNumProbes; i++) { |
||||
const uint32_t bitpos = h % kTotalBits; |
||||
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { |
||||
return false; |
||||
} |
||||
h += delta; |
||||
} |
||||
return true; |
||||
} |
||||
|
||||
inline void DynamicBloom::AddHash(uint32_t h) { |
||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||
for (uint32_t i = 0; i < kNumProbes; i++) { |
||||
const uint32_t bitpos = h % kTotalBits; |
||||
data_[bitpos / 8] |= (1 << (bitpos % 8)); |
||||
h += delta; |
||||
} |
||||
} |
||||
|
||||
} // rocksdb
|
@ -0,0 +1,113 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include <gflags/gflags.h> |
||||
|
||||
#include "dynamic_bloom.h" |
||||
#include "util/logging.h" |
||||
#include "util/testharness.h" |
||||
#include "util/testutil.h" |
||||
|
||||
DEFINE_int32(bits_per_key, 10, ""); |
||||
DEFINE_int32(num_probes, 6, ""); |
||||
|
||||
namespace rocksdb { |
||||
|
||||
static Slice Key(int i, char* buffer) { |
||||
memcpy(buffer, &i, sizeof(i)); |
||||
return Slice(buffer, sizeof(i)); |
||||
} |
||||
|
||||
class DynamicBloomTest { |
||||
}; |
||||
|
||||
TEST(DynamicBloomTest, EmptyFilter) { |
||||
DynamicBloom bloom(100, 2); |
||||
ASSERT_TRUE(! bloom.MayContain("hello")); |
||||
ASSERT_TRUE(! bloom.MayContain("world")); |
||||
} |
||||
|
||||
TEST(DynamicBloomTest, Small) { |
||||
DynamicBloom bloom(100, 2); |
||||
bloom.Add("hello"); |
||||
bloom.Add("world"); |
||||
ASSERT_TRUE(bloom.MayContain("hello")); |
||||
ASSERT_TRUE(bloom.MayContain("world")); |
||||
ASSERT_TRUE(! bloom.MayContain("x")); |
||||
ASSERT_TRUE(! bloom.MayContain("foo")); |
||||
} |
||||
|
||||
static int NextLength(int length) { |
||||
if (length < 10) { |
||||
length += 1; |
||||
} else if (length < 100) { |
||||
length += 10; |
||||
} else if (length < 1000) { |
||||
length += 100; |
||||
} else { |
||||
length += 1000; |
||||
} |
||||
return length; |
||||
} |
||||
|
||||
TEST(DynamicBloomTest, VaryingLengths) { |
||||
char buffer[sizeof(int)]; |
||||
|
||||
// Count number of filters that significantly exceed the false positive rate
|
||||
int mediocre_filters = 0; |
||||
int good_filters = 0; |
||||
|
||||
fprintf(stderr, "bits_per_key: %d num_probes: %d\n", |
||||
FLAGS_bits_per_key, FLAGS_num_probes); |
||||
|
||||
for (int length = 1; length <= 10000; length = NextLength(length)) { |
||||
uint32_t bloom_bits = std::max(length * FLAGS_bits_per_key, 64); |
||||
DynamicBloom bloom(bloom_bits, FLAGS_num_probes); |
||||
for (int i = 0; i < length; i++) { |
||||
bloom.Add(Key(i, buffer)); |
||||
ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); |
||||
} |
||||
|
||||
// All added keys must match
|
||||
for (int i = 0; i < length; i++) { |
||||
ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) |
||||
<< "Length " << length << "; key " << i; |
||||
} |
||||
|
||||
// Check false positive rate
|
||||
|
||||
int result = 0; |
||||
for (int i = 0; i < 10000; i++) { |
||||
if (bloom.MayContain(Key(i + 1000000000, buffer))) { |
||||
result++; |
||||
} |
||||
} |
||||
double rate = result / 10000.0; |
||||
|
||||
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; \n", |
||||
rate*100.0, length); |
||||
|
||||
//ASSERT_LE(rate, 0.02); // Must not be over 2%
|
||||
if (rate > 0.0125) |
||||
mediocre_filters++; // Allowed, but not too often
|
||||
else |
||||
good_filters++; |
||||
} |
||||
|
||||
fprintf(stderr, "Filters: %d good, %d mediocre\n", |
||||
good_filters, mediocre_filters); |
||||
|
||||
ASSERT_LE(mediocre_filters, good_filters/5); |
||||
} |
||||
|
||||
// Different bits-per-byte
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) { |
||||
google::ParseCommandLineFlags(&argc, &argv, true); |
||||
|
||||
return rocksdb::test::RunAllTests(); |
||||
} |
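As background for the false-positive check in the VaryingLengths test above, the textbook approximation for a Bloom filter with m bits, n keys, and k probes is

  p \approx \left(1 - e^{-kn/m}\right)^{k}

With the defaults here (bits_per_key = 10, so m/n is roughly 10, and num_probes = 6) this evaluates to about (1 - e^{-0.6})^6, roughly 0.8%, comfortably under the 1.25% threshold the test uses to separate good from mediocre filters. This estimate comes from standard Bloom filter analysis, not from anything computed in the test itself.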
@ -0,0 +1,470 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
|
||||
#include "util/hash_linklist_rep.h" |
||||
|
||||
#include "rocksdb/memtablerep.h" |
||||
#include "util/arena.h" |
||||
#include "rocksdb/slice.h" |
||||
#include "rocksdb/slice_transform.h" |
||||
#include "port/port.h" |
||||
#include "port/atomic_pointer.h" |
||||
#include "util/murmurhash.h" |
||||
#include "db/memtable.h" |
||||
#include "db/skiplist.h" |
||||
|
||||
namespace rocksdb { |
||||
namespace { |
||||
|
||||
typedef const char* Key; |
||||
|
||||
struct Node { |
||||
explicit Node(const Key& k) : |
||||
key(k) { |
||||
} |
||||
|
||||
Key const key; |
||||
|
||||
// Accessors/mutators for links. Wrapped in methods so we can
|
||||
// add the appropriate barriers as necessary.
|
||||
Node* Next() { |
||||
// Use an 'acquire load' so that we observe a fully initialized
|
||||
// version of the returned Node.
|
||||
return reinterpret_cast<Node*>(next_.Acquire_Load()); |
||||
} |
||||
void SetNext(Node* x) { |
||||
// Use a 'release store' so that anybody who reads through this
|
||||
// pointer observes a fully initialized version of the inserted node.
|
||||
next_.Release_Store(x); |
||||
} |
||||
|
||||
// No-barrier variants that can be safely used in a few locations.
|
||||
Node* NoBarrier_Next() { |
||||
return reinterpret_cast<Node*>(next_.NoBarrier_Load()); |
||||
} |
||||
void NoBarrier_SetNext(Node* x) { |
||||
next_.NoBarrier_Store(x); |
||||
} |
||||
|
||||
private: |
||||
port::AtomicPointer next_; |
||||
}; |
||||
|
||||
class HashLinkListRep : public MemTableRep { |
||||
public: |
||||
HashLinkListRep(MemTableRep::KeyComparator& compare, Arena* arena, |
||||
const SliceTransform* transform, size_t bucket_size); |
||||
|
||||
virtual void Insert(const char* key) override; |
||||
|
||||
virtual bool Contains(const char* key) const override; |
||||
|
||||
virtual size_t ApproximateMemoryUsage() override; |
||||
|
||||
virtual ~HashLinkListRep(); |
||||
|
||||
virtual MemTableRep::Iterator* GetIterator() override; |
||||
|
||||
virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; |
||||
|
||||
virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) |
||||
override; |
||||
|
||||
virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; |
||||
|
||||
private: |
||||
friend class DynamicIterator; |
||||
typedef SkipList<const char*, MemTableRep::KeyComparator&> FullList; |
||||
|
||||
size_t bucket_size_; |
||||
|
||||
// Maps slices (which are transformed user keys) to buckets of keys sharing
|
||||
// the same transform.
|
||||
port::AtomicPointer* buckets_; |
||||
|
||||
// The user-supplied transform whose domain is the user keys.
|
||||
const SliceTransform* transform_; |
||||
|
||||
MemTableRep::KeyComparator& compare_; |
||||
// immutable after construction
|
||||
Arena* const arena_; |
||||
|
||||
bool BucketContains(Node* head, const Slice& key) const; |
||||
|
||||
Slice GetPrefix(const Slice& internal_key) const { |
||||
return transform_->Transform(ExtractUserKey(internal_key)); |
||||
} |
||||
|
||||
size_t GetHash(const Slice& slice) const { |
||||
return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; |
||||
} |
||||
|
||||
Node* GetBucket(size_t i) const { |
||||
return static_cast<Node*>(buckets_[i].Acquire_Load()); |
||||
} |
||||
|
||||
Node* GetBucket(const Slice& slice) const { |
||||
return GetBucket(GetHash(slice)); |
||||
} |
||||
|
||||
Node* NewNode(const Key& key) { |
||||
char* mem = arena_->AllocateAligned(sizeof(Node)); |
||||
return new (mem) Node(key); |
||||
} |
||||
|
||||
bool Equal(const Slice& a, const Key& b) const { |
||||
return (compare_(b, a) == 0); |
||||
} |
||||
|
||||
|
||||
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } |
||||
|
||||
bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { |
||||
// nullptr n is considered infinite
|
||||
return (n != nullptr) && (compare_(n->key, internal_key) < 0); |
||||
} |
||||
|
||||
bool KeyIsAfterNode(const Key& key, const Node* n) const { |
||||
// nullptr n is considered infinite
|
||||
return (n != nullptr) && (compare_(n->key, key) < 0); |
||||
} |
||||
|
||||
|
||||
Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; |
||||
|
||||
class FullListIterator : public MemTableRep::Iterator { |
||||
public: |
||||
explicit FullListIterator(FullList* list) |
||||
: iter_(list), full_list_(list) {} |
||||
|
||||
virtual ~FullListIterator() { |
||||
} |
||||
|
||||
// Returns true iff the iterator is positioned at a valid node.
|
||||
virtual bool Valid() const { |
||||
return iter_.Valid(); |
||||
} |
||||
|
||||
// Returns the key at the current position.
|
||||
// REQUIRES: Valid()
|
||||
virtual const char* key() const { |
||||
assert(Valid()); |
||||
return iter_.key(); |
||||
} |
||||
|
||||
// Advances to the next position.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Next() { |
||||
assert(Valid()); |
||||
iter_.Next(); |
||||
} |
||||
|
||||
// Advances to the previous position.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Prev() { |
||||
assert(Valid()); |
||||
iter_.Prev(); |
||||
} |
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
virtual void Seek(const Slice& internal_key, const char* memtable_key) { |
||||
const char* encoded_key = |
||||
(memtable_key != nullptr) ? |
||||
memtable_key : EncodeKey(&tmp_, internal_key); |
||||
iter_.Seek(encoded_key); |
||||
} |
||||
|
||||
// Position at the first entry in collection.
|
||||
// Final state of iterator is Valid() iff collection is not empty.
|
||||
virtual void SeekToFirst() { |
||||
iter_.SeekToFirst(); |
||||
} |
||||
|
||||
// Position at the last entry in collection.
|
||||
// Final state of iterator is Valid() iff collection is not empty.
|
||||
virtual void SeekToLast() { |
||||
iter_.SeekToLast(); |
||||
} |
||||
private: |
||||
FullList::Iterator iter_; |
||||
// To destruct with the iterator.
|
||||
std::unique_ptr<FullList> full_list_; |
||||
std::string tmp_; // For passing to EncodeKey
|
||||
}; |
||||
|
||||
class Iterator : public MemTableRep::Iterator { |
||||
public: |
||||
explicit Iterator(const HashLinkListRep* const hash_link_list_rep, |
||||
Node* head) : |
||||
hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) { |
||||
} |
||||
|
||||
virtual ~Iterator() { |
||||
} |
||||
|
||||
// Returns true iff the iterator is positioned at a valid node.
|
||||
virtual bool Valid() const { |
||||
return node_ != nullptr; |
||||
} |
||||
|
||||
// Returns the key at the current position.
|
||||
// REQUIRES: Valid()
|
||||
virtual const char* key() const { |
||||
assert(Valid()); |
||||
return node_->key; |
||||
} |
||||
|
||||
// Advances to the next position.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Next() { |
||||
assert(Valid()); |
||||
node_ = node_->Next(); |
||||
} |
||||
|
||||
// Advances to the previous position.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Prev() { |
||||
// Prefix iterator does not support total order.
|
||||
// We simply set the iterator to invalid state
|
||||
Reset(nullptr); |
||||
} |
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
virtual void Seek(const Slice& internal_key, const char* memtable_key) { |
||||
node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, |
||||
internal_key); |
||||
} |
||||
|
||||
// Position at the first entry in collection.
|
||||
// Final state of iterator is Valid() iff collection is not empty.
|
||||
virtual void SeekToFirst() { |
||||
// Prefix iterator does not support total order.
|
||||
// We simply set the iterator to invalid state
|
||||
Reset(nullptr); |
||||
} |
||||
|
||||
// Position at the last entry in collection.
|
||||
// Final state of iterator is Valid() iff collection is not empty.
|
||||
virtual void SeekToLast() { |
||||
// Prefix iterator does not support total order.
|
||||
// We simply set the iterator to invalid state
|
||||
Reset(nullptr); |
||||
} |
||||
|
||||
protected: |
||||
void Reset(Node* head) { |
||||
head_ = head; |
||||
node_ = nullptr; |
||||
} |
||||
private: |
||||
friend class HashLinkListRep; |
||||
const HashLinkListRep* const hash_link_list_rep_; |
||||
Node* head_; |
||||
Node* node_; |
||||
std::string tmp_; // For passing to EncodeKey
|
||||
|
||||
virtual void SeekToHead() { |
||||
node_ = head_; |
||||
} |
||||
}; |
||||
|
||||
class DynamicIterator : public HashLinkListRep::Iterator { |
||||
public: |
||||
explicit DynamicIterator(HashLinkListRep& memtable_rep) |
||||
: HashLinkListRep::Iterator(&memtable_rep, nullptr), |
||||
memtable_rep_(memtable_rep) {} |
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
virtual void Seek(const Slice& k, const char* memtable_key) { |
||||
auto transformed = memtable_rep_.GetPrefix(k); |
||||
Reset(memtable_rep_.GetBucket(transformed)); |
||||
HashLinkListRep::Iterator::Seek(k, memtable_key); |
||||
} |
||||
|
||||
private: |
||||
// the underlying memtable
|
||||
const HashLinkListRep& memtable_rep_; |
||||
}; |
||||
|
||||
class EmptyIterator : public MemTableRep::Iterator { |
||||
// This is used when there wasn't a bucket. It is cheaper than
|
||||
// instantiating an empty bucket over which to iterate.
|
||||
public: |
||||
EmptyIterator() { } |
||||
virtual bool Valid() const { |
||||
return false; |
||||
} |
||||
virtual const char* key() const { |
||||
assert(false); |
||||
return nullptr; |
||||
} |
||||
virtual void Next() { } |
||||
virtual void Prev() { } |
||||
virtual void Seek(const Slice& user_key, const char* memtable_key) { } |
||||
virtual void SeekToFirst() { } |
||||
virtual void SeekToLast() { } |
||||
private: |
||||
}; |
||||
}; |
||||
|
||||
HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare, |
||||
Arena* arena, const SliceTransform* transform, |
||||
size_t bucket_size) |
||||
: bucket_size_(bucket_size), |
||||
transform_(transform), |
||||
compare_(compare), |
||||
arena_(arena) { |
||||
char* mem = arena_->AllocateAligned( |
||||
sizeof(port::AtomicPointer) * bucket_size); |
||||
|
||||
buckets_ = new (mem) port::AtomicPointer[bucket_size]; |
||||
|
||||
for (size_t i = 0; i < bucket_size_; ++i) { |
||||
buckets_[i].NoBarrier_Store(nullptr); |
||||
} |
||||
} |
||||
|
||||
HashLinkListRep::~HashLinkListRep() { |
||||
} |
||||
|
||||
void HashLinkListRep::Insert(const char* key) { |
||||
assert(!Contains(key)); |
||||
Slice internal_key = GetLengthPrefixedSlice(key); |
||||
auto transformed = GetPrefix(internal_key); |
||||
auto& bucket = buckets_[GetHash(transformed)]; |
||||
Node* head = static_cast<Node*>(bucket.Acquire_Load()); |
||||
|
||||
if (!head) { |
||||
Node* x = NewNode(key); |
||||
// NoBarrier_SetNext() suffices since we will add a barrier when
|
||||
// we publish a pointer to "x" in prev[i].
|
||||
x->NoBarrier_SetNext(nullptr); |
||||
bucket.Release_Store(static_cast<void*>(x)); |
||||
return; |
||||
} |
||||
|
||||
Node* cur = head; |
||||
Node* prev = nullptr; |
||||
while (true) { |
||||
if (cur == nullptr) { |
||||
break; |
||||
} |
||||
Node* next = cur->Next(); |
||||
// Make sure the lists are sorted.
|
||||
// If cur is the head or next is nullptr, this is trivially satisfied.
||||
assert((cur == head) || (next == nullptr) || |
||||
KeyIsAfterNode(next->key, cur)); |
||||
if (KeyIsAfterNode(internal_key, cur)) { |
||||
// Keep searching in this list
|
||||
prev = cur; |
||||
cur = next; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
|
||||
// Our data structure does not allow duplicate insertion
|
||||
assert(cur == nullptr || !Equal(key, cur->key)); |
||||
|
||||
Node* x = NewNode(key); |
||||
|
||||
// NoBarrier_SetNext() suffices since we will add a barrier when
|
||||
// we publish a pointer to "x" in prev[i].
|
||||
x->NoBarrier_SetNext(cur); |
||||
|
||||
if (prev) { |
||||
prev->SetNext(x); |
||||
} else { |
||||
bucket.Release_Store(static_cast<void*>(x)); |
||||
} |
||||
} |
||||
|
||||
bool HashLinkListRep::Contains(const char* key) const { |
||||
Slice internal_key = GetLengthPrefixedSlice(key); |
||||
|
||||
auto transformed = GetPrefix(internal_key); |
||||
auto bucket = GetBucket(transformed); |
||||
if (bucket == nullptr) { |
||||
return false; |
||||
} |
||||
return BucketContains(bucket, internal_key); |
||||
} |
||||
|
||||
size_t HashLinkListRep::ApproximateMemoryUsage() { |
||||
// Memory is always allocated from the arena.
|
||||
return 0; |
||||
} |
||||
|
||||
MemTableRep::Iterator* HashLinkListRep::GetIterator() { |
||||
auto list = new FullList(compare_, arena_); |
||||
for (size_t i = 0; i < bucket_size_; ++i) { |
||||
auto bucket = GetBucket(i); |
||||
if (bucket != nullptr) { |
||||
Iterator itr(this, bucket); |
||||
for (itr.SeekToHead(); itr.Valid(); itr.Next()) { |
||||
list->Insert(itr.key()); |
||||
} |
||||
} |
||||
} |
||||
return new FullListIterator(list); |
||||
} |
||||
|
||||
MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator( |
||||
const Slice& prefix) { |
||||
auto bucket = GetBucket(prefix); |
||||
if (bucket == nullptr) { |
||||
return new EmptyIterator(); |
||||
} |
||||
return new Iterator(this, bucket); |
||||
} |
||||
|
||||
MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) { |
||||
return GetPrefixIterator(transform_->Transform(slice)); |
||||
} |
||||
|
||||
MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() { |
||||
return new DynamicIterator(*this); |
||||
} |
||||
|
||||
bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const { |
||||
Node* x = FindGreaterOrEqualInBucket(head, user_key); |
||||
return (x != nullptr && Equal(user_key, x->key)); |
||||
} |
||||
|
||||
Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, |
||||
const Slice& key) const { |
||||
Node* x = head; |
||||
while (true) { |
||||
if (x == nullptr) { |
||||
return x; |
||||
} |
||||
Node* next = x->Next(); |
||||
// Make sure the lists are sorted.
|
||||
// If x is the head or next is nullptr, this is trivially satisfied.
||||
assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); |
||||
if (KeyIsAfterNode(key, x)) { |
||||
// Keep searching in this list
|
||||
x = next; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
return x; |
||||
} |
||||
|
||||
} // anon namespace
|
||||
|
||||
MemTableRep* HashLinkListRepFactory::CreateMemTableRep( |
||||
MemTableRep::KeyComparator& compare, Arena* arena) { |
||||
return new HashLinkListRep(compare, arena, transform_, bucket_count_); |
||||
} |
||||
|
||||
MemTableRepFactory* NewHashLinkListRepFactory( |
||||
const SliceTransform* transform, size_t bucket_count) { |
||||
return new HashLinkListRepFactory(transform, bucket_count); |
||||
} |
||||
|
||||
} // namespace rocksdb
|
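As a usage sketch, the factory defined above would normally be handed to the DB through Options::memtable_factory together with a prefix extractor. The option names below reflect the options of this era of the codebase and the include that declares the factory helper is assumed; the prefix length and bucket count are arbitrary. A separate transform instance is passed to the factory because its destructor deletes it (see the header above).

#include <string>

#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"       // assumed to declare NewHashLinkListRepFactory
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

rocksdb::Status OpenWithHashLinkList(const std::string& path,
                                     rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // The rep hashes on the key prefix, so a prefix extractor must be set.
  options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
  options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(
      rocksdb::NewFixedPrefixTransform(8), /*bucket_count=*/50000));
  return rocksdb::DB::Open(options, path, db);
}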
@ -0,0 +1,39 @@ |
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once |
||||
#include "rocksdb/slice_transform.h" |
||||
#include "rocksdb/memtablerep.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class HashLinkListRepFactory : public MemTableRepFactory { |
||||
public: |
||||
explicit HashLinkListRepFactory( |
||||
const SliceTransform* transform, |
||||
size_t bucket_count) |
||||
: transform_(transform), |
||||
bucket_count_(bucket_count) { } |
||||
|
||||
virtual ~HashLinkListRepFactory() { delete transform_; } |
||||
|
||||
virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare, |
||||
Arena* arena) override; |
||||
|
||||
virtual const char* Name() const override { |
||||
return "HashLinkListRepFactory"; |
||||
} |
||||
|
||||
const SliceTransform* GetTransform() { return transform_; } |
||||
|
||||
private: |
||||
const SliceTransform* transform_; |
||||
const size_t bucket_count_; |
||||
}; |
||||
|
||||
} |
Some files were not shown because too many files have changed in this diff.