From 5917de0bae64c104cd3311eb8fe8dd1bc812dd4f Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 21 Jan 2015 11:09:56 -0800 Subject: [PATCH] CappedFixTransform: return fixed length prefix, or full key if key is shorter than the fixed length Summary: Add CappedFixTransform, which is the same as fixed length prefix extractor, except that when slice is shorter than the fixed length, it will use the full key. Test Plan: Add a test case for db_test options_test and a new test Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: MarkCallaghan, leveldb, dhruba, yoshinorim Differential Revision: https://reviews.facebook.net/D31887 --- HISTORY.md | 1 + Makefile | 5 + db/db_test.cc | 59 +++++++----- include/rocksdb/slice_transform.h | 27 ++++++ util/options_helper.cc | 27 ++++-- util/options_test.cc | 11 ++- util/slice.cc | 40 ++++++++ util/slice_transform_test.cc | 150 ++++++++++++++++++++++++++++++ 8 files changed, 286 insertions(+), 34 deletions(-) create mode 100644 util/slice_transform_test.cc diff --git a/HISTORY.md b/HISTORY.md index 22b0c05ab..c688585e5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -12,6 +12,7 @@ * Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions. * GetThreadStatus() is now able to report compaction activity. * MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv(). +* Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash. 
### Public API changes * Deprecated skip_log_error_on_recovery option diff --git a/Makefile b/Makefile index 36814780b..1c0dea975 100644 --- a/Makefile +++ b/Makefile @@ -128,6 +128,7 @@ TESTS = \ coding_test \ corruption_test \ crc32c_test \ + slice_transform_test \ dbformat_test \ env_test \ fault_injection_test \ @@ -403,6 +404,10 @@ corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + + db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/db_test.cc b/db/db_test.cc index 5941dc2d2..9014c9c86 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -389,28 +389,29 @@ class DBTest { kBlockBasedTableWithPrefixHashIndex = 1, kBlockBasedTableWithWholeKeyHashIndex = 2, kPlainTableFirstBytePrefix = 3, - kPlainTableAllBytesPrefix = 4, - kVectorRep = 5, - kHashLinkList = 6, - kHashCuckoo = 7, - kMergePut = 8, - kFilter = 9, - kFullFilter = 10, - kUncompressed = 11, - kNumLevel_3 = 12, - kDBLogDir = 13, - kWalDirAndMmapReads = 14, - kManifestFileSize = 15, - kCompactOnFlush = 16, - kPerfOptions = 17, - kDeletesFilterFirst = 18, - kHashSkipList = 19, - kUniversalCompaction = 20, - kCompressedBlockCache = 21, - kInfiniteMaxOpenFiles = 22, - kxxHashChecksum = 23, - kFIFOCompaction = 24, - kEnd = 25 + kPlainTableCappedPrefix = 4, + kPlainTableAllBytesPrefix = 5, + kVectorRep = 6, + kHashLinkList = 7, + kHashCuckoo = 8, + kMergePut = 9, + kFilter = 10, + kFullFilter = 11, + kUncompressed = 12, + kNumLevel_3 = 13, + kDBLogDir = 14, + kWalDirAndMmapReads = 15, + kManifestFileSize = 16, + kCompactOnFlush 
= 17, + kPerfOptions = 18, + kDeletesFilterFirst = 19, + kHashSkipList = 20, + kUniversalCompaction = 21, + kCompressedBlockCache = 22, + kInfiniteMaxOpenFiles = 23, + kxxHashChecksum = 24, + kFIFOCompaction = 25, + kEnd = 26 }; int option_config_; @@ -483,9 +484,10 @@ class DBTest { option_config_ == kHashSkipList)) {; continue; } - if ((skip_mask & kSkipPlainTable) - && (option_config_ == kPlainTableAllBytesPrefix - || option_config_ == kPlainTableFirstBytePrefix)) { + if ((skip_mask & kSkipPlainTable) && + (option_config_ == kPlainTableAllBytesPrefix || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kPlainTableCappedPrefix)) { continue; } if ((skip_mask & kSkipHashIndex) && @@ -577,6 +579,13 @@ class DBTest { options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; + case kPlainTableCappedPrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; + break; case kPlainTableAllBytesPrefix: options.table_factory.reset(new PlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index a78455001..c51dd8cb8 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -36,10 +36,37 @@ class SliceTransform { // determine whether dst=Transform(src) for some src virtual bool InRange(const Slice& dst) const = 0; + + // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. + // + // This function is not used by RocksDB, but for users. If users pass + // Options by string to RocksDB, they might not know what prefix extractor + // they are using. 
This function helps users determine: + // if they want to iterate all keys prefixing `prefix`, whether it is + // safe to use prefix bloom filter and seek to key `prefix`. + // Only returning true indicates it is correct to do that. + // + // Here is an example: Suppose we implement a slice transform that returns + // the first part of the string after splitting it using delimiter ",": + // 1. SameResultWhenAppended("abc,") should return true. If applying prefix + // bloom filter using it, all slices matching "abc,.*" will be extracted + // to "abc,", so any SST file or memtable containing any of those keys + // will not be filtered out. + // 2. SameResultWhenAppended("abc") should return false. A user is not + // guaranteed to see all the keys matching "abc.*" if the user seeks to "abc" + // against a DB with the same setting. If one SST file only contains + // "abcd,e", the file can be filtered out and the key will be invisible. + // + // i.e., an implementation always returning false is safe. 
+ virtual bool SameResultWhenAppended(const Slice& prefix) const { + return false; + } }; extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); +extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); + extern const SliceTransform* NewNoopTransform(); } diff --git a/util/options_helper.cc b/util/options_helper.cc index efc028497..d720a91e6 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -510,14 +510,27 @@ Status GetColumnFamilyOptionsFromMap( } else if (o.first == "inplace_update_support") { new_options->inplace_update_support = ParseBoolean(o.first, o.second); } else if (o.first == "prefix_extractor") { - const std::string kName = "fixed:"; - if (o.second.compare(0, kName.size(), kName) != 0) { - return Status::InvalidArgument("Invalid Prefix Extractor type: " - + o.second); + const std::string kFixedPrefixName = "fixed:"; + const std::string kCappedPrefixName = "capped:"; + auto& pe_value = o.second; + if (pe_value.size() > kFixedPrefixName.size() && + pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == + 0) { + int prefix_length = + ParseInt(trim(o.second.substr(kFixedPrefixName.size()))); + new_options->prefix_extractor.reset( + NewFixedPrefixTransform(prefix_length)); + } else if (pe_value.size() > kCappedPrefixName.size() && + pe_value.compare(0, kCappedPrefixName.size(), + kCappedPrefixName) == 0) { + int prefix_length = + ParseInt(trim(pe_value.substr(kCappedPrefixName.size()))); + new_options->prefix_extractor.reset( + NewCappedPrefixTransform(prefix_length)); + } else { + return Status::InvalidArgument("Invalid Prefix Extractor type: " + + pe_value); } - int prefix_length = ParseInt(trim(o.second.substr(kName.size()))); - new_options->prefix_extractor.reset( - NewFixedPrefixTransform(prefix_length)); } else { return Status::InvalidArgument("Unrecognized option: " + o.first); } diff --git a/util/options_test.cc b/util/options_test.cc index cd26b0211..5ddfac27c 100644 --- 
a/util/options_test.cc +++ b/util/options_test.cc @@ -330,10 +330,17 @@ TEST(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024); ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024UL*1024UL); // Units (g) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=18g;arena_block_size=19G", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=18g;prefix_extractor=capped:8;" + "arena_block_size=19G", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024UL*1024UL*1024UL); ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024UL*1024UL*1024UL); + ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); + std::string prefix_name(new_cf_opt.prefix_extractor->Name()); + ASSERT_EQ(prefix_name, "rocksdb.CappedPrefix.8"); + // Units (t) ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); diff --git a/util/slice.cc b/util/slice.cc index cd197ced5..734ea974b 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -39,6 +39,38 @@ class FixedPrefixTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const { return (dst.size() == prefix_len_); } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return InDomain(prefix); + } +}; + +class CappedPrefixTransform : public SliceTransform { + private: + size_t cap_len_; + std::string name_; + + public: + explicit CappedPrefixTransform(size_t cap_len) + : cap_len_(cap_len), + name_("rocksdb.CappedPrefix." 
+ ToString(cap_len_)) {} + + virtual const char* Name() const { return name_.c_str(); } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), std::min(cap_len_, src.size())); + } + + virtual bool InDomain(const Slice& src) const { return true; } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= cap_len_); + } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return prefix.size() >= cap_len_; + } }; class NoopTransform : public SliceTransform { @@ -60,6 +92,10 @@ class NoopTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const { return true; } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return false; + } }; } @@ -68,6 +104,10 @@ const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { return new FixedPrefixTransform(prefix_len); } +const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { + return new CappedPrefixTransform(cap_len); +} + const SliceTransform* NewNoopTransform() { return new NoopTransform; } diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc new file mode 100644 index 000000000..9f0e34b15 --- /dev/null +++ b/util/slice_transform_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/slice_transform.h" + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "util/testharness.h" + +namespace rocksdb { + +class SliceTransformTest {}; + +TEST(SliceTransformTest, CapPrefixTransform) { + std::string s; + s = "abcdefge"; + + unique_ptr transform; + + transform.reset(NewCappedPrefixTransform(6)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdef"); + ASSERT_TRUE(transform->SameResultWhenAppended("123456")); + ASSERT_TRUE(transform->SameResultWhenAppended("1234567")); + ASSERT_TRUE(!transform->SameResultWhenAppended("12345")); + + transform.reset(NewCappedPrefixTransform(8)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(10)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform(s).ToString(), ""); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform("").ToString(), ""); +} + +class SliceTransformDBTest { + private: + std::string dbname_; + Env* env_; + DB* db_; + + public: + SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) { + dbname_ = test::TmpDir() + "/slice_transform_db_test"; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + ~SliceTransformDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + DB* db() { return db_; } + + // Return the current option configuration. 
+ Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } + + Options last_options_; +}; + +namespace { +uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} +} // namespace + +TEST(SliceTransformDBTest, CapPrefix) { + last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8)); + last_options_.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_OK(TryReopen()); + + ReadOptions ro; + FlushOptions fo; + WriteOptions wo; + + ASSERT_OK(db()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(db()->Put(wo, "foo", "bar")); + ASSERT_OK(db()->Put(wo, "foo3", "bar3")); + ASSERT_OK(db()->Flush(fo)); + + unique_ptr iter(db()->NewIterator(ro)); + + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "bar"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U); + + iter->Seek("foo2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barbarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "foo"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barfoofoo"); + ASSERT_OK(iter->status()); + 
ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U); + + iter->Seek("foobarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }