From f5f184228291cd885918555044276ba8dd4e9568 Mon Sep 17 00:00:00 2001 From: Tyler Harter Date: Tue, 13 Aug 2013 14:04:56 -0700 Subject: [PATCH] Prefix filters for scans (v4) Summary: Similar to v2 (db and table code understands prefixes), but use ReadOptions as in v3. Also, make the CreateFilter code faster and cleaner. Test Plan: make db_test; export LEVELDB_TESTS=PrefixScan; ./db_test Reviewers: dhruba Reviewed By: dhruba CC: haobo, emayanke Differential Revision: https://reviews.facebook.net/D12027 --- db/db_impl.cc | 20 +++-- db/db_test.cc | 129 ++++++++++++++++++++++++++++++ db/prefix_filter_iterator.h | 76 ++++++++++++++++++ include/leveldb/options.h | 42 +++++++++- include/leveldb/slice_transform.h | 41 ++++++++++ table/filter_block.cc | 101 ++++++++++++++++++----- table/filter_block.h | 24 ++++-- table/filter_block_test.cc | 18 +++-- table/table.cc | 63 ++++++++++++++- table/table.h | 2 + table/table_builder.cc | 2 +- util/fixed_prefix_transform.cc | 43 ++++++++++ util/options.cc | 2 + 13 files changed, 519 insertions(+), 44 deletions(-) create mode 100644 db/prefix_filter_iterator.h create mode 100644 include/leveldb/slice_transform.h create mode 100644 util/fixed_prefix_transform.cc diff --git a/db/db_impl.cc b/db/db_impl.cc index 6083746fa..e510c4505 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -23,6 +23,7 @@ #include "db/memtable.h" #include "db/memtablelist.h" #include "db/merge_helper.h" +#include "db/prefix_filter_iterator.h" #include "db/table_cache.h" #include "db/version_set.h" #include "db/write_batch_internal.h" @@ -2339,12 +2340,19 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - return NewDBIterator( - &dbname_, env_, options_, user_comparator(), internal_iter, - (options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); + Iterator* iter = NewInternalIterator(options, &latest_snapshot); + iter = NewDBIterator( + &dbname_, env_, options_, user_comparator(), iter, + (options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); + if (options.prefix) { + // use extra wrapper to exclude any keys from the results which + // don't begin with the prefix + iter = new PrefixFilterIterator(iter, *options.prefix, + options_.prefix_extractor); + } + return iter; } const Snapshot* DBImpl::GetSnapshot() { diff --git a/db/db_test.cc b/db/db_test.cc index 18f2e8832..492ae04be 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3640,6 +3640,135 @@ TEST(DBTest, MultiGetEmpty) { } while (ChangeCompactOptions()); } +void PrefixScanInit(DBTest *dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_CompactMemTable(); + dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_CompactMemTable(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + std::string keystr; + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", + small_range_sstfiles+i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_CompactMemTable(); + } +} + +TEST(DBTest, PrefixScan) { + ReadOptions ro = ReadOptions(); + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.block_cache = NewLRUCache(0); // Prevent cache hits + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor = NewFixedPrefixTransform(8); + options.whole_key_filtering = false; + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.disable_seek_compaction = true; + + // prefix specified, with blooms: 2 RAND I/Os + // SeekToFirst + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + ASSERT_TRUE(iter->status().ok()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // prefix specified, with blooms: 2 RAND I/Os + // Seek + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + ASSERT_TRUE(iter->status().ok()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // no prefix specified: 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! 
iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_TRUE(iter->status().ok()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 11); + Close(); + delete options.filter_policy; + delete options.prefix_extractor; +} + std::string MakeKey(unsigned int num) { char buf[30]; snprintf(buf, sizeof(buf), "%016u", num); diff --git a/db/prefix_filter_iterator.h b/db/prefix_filter_iterator.h new file mode 100644 index 000000000..1e0fbe3ca --- /dev/null +++ b/db/prefix_filter_iterator.h @@ -0,0 +1,76 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Wrap an underlying iterator, but exclude any results not starting +// with a given prefix. Seeking to keys not beginning with the prefix +// is invalid, and SeekToLast is not implemented (that would be +// non-trivial), but otherwise this iterator will behave just like the +// underlying iterator would if there happened to be no non-matching +// keys in the dataset. + +#ifndef STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_ +#define STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_ + +#include "leveldb/iterator.h" + +namespace leveldb { + +class PrefixFilterIterator : public Iterator { + private: + Iterator* iter_; + const Slice &prefix_; + const SliceTransform *prefix_extractor_; + Status status_; + + public: + PrefixFilterIterator(Iterator* iter, + const Slice &prefix, + const SliceTransform* prefix_extractor) + : iter_(iter), prefix_(prefix), + prefix_extractor_(prefix_extractor), + status_(Status::OK()) { + if (prefix_extractor == nullptr) { + status_ = Status::InvalidArgument("A prefix filter may not be used " + "unless a function is also defined " + "for extracting prefixes"); + } else if (!prefix_extractor_->InRange(prefix)) { + status_ = Status::InvalidArgument("Must provide a slice for prefix which" + "is a prefix for some key"); + } + } + ~PrefixFilterIterator() { + delete iter_; + } + Slice key() const { return iter_->key(); } + Slice value() const { return iter_->value(); } + Status status() const { + if (!status_.ok()) { + return status_; + } + return iter_->status(); + } + void Next() { iter_->Next(); } + void Prev() { iter_->Prev(); } + void Seek(const Slice& k) { + if (prefix_extractor_->Transform(k) == prefix_) { + iter_->Seek(k); + } else { + status_ = Status::InvalidArgument("Seek must begin with target prefix"); + } + } + void SeekToFirst() { + Seek(prefix_); + } + void SeekToLast() { + status_ = Status::NotSupported("SeekToLast is incompatible with prefixes"); + } + bool Valid() const { + return (status_.ok() && iter_->Valid() && + prefix_extractor_->Transform(iter_->key()) == prefix_); + } +}; + +} // namespace leveldb + +#endif diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 30a8dc328..63899d8f7 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -15,6 +15,8 @@ #include "leveldb/universal_compaction.h" #include "leveldb/memtablerep.h" +#include "leveldb/slice_transform.h" + namespace leveldb { class Cache; @@ -224,6 +226,28 @@ struct Options { // Default: nullptr const FilterPolicy* filter_policy; + // If non-nullptr, use the specified function to determine the + // prefixes for keys. These prefixes will be placed in the filter. 
+ // Depending on the workload, this can reduce the number of read-IOP + // cost for scans when a prefix is passed via ReadOptions to + // db.NewIterator(). For prefix filtering to work properly, + // "prefix_extractor" and "comparator" must be such that the following + // properties hold: + // + // 1) key.starts_with(prefix(key)) + // 2) Compare(prefix(key), key) <= 0. + // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 + // 4) prefix(prefix(key)) == prefix(key) + // + // Default: nullptr + const SliceTransform* prefix_extractor; + + // If true, place whole keys in the filter (not just prefixes). + // This must generally be true for gets to be efficient. + // + // Default: true + bool whole_key_filtering; + // Number of levels for this database int num_levels; @@ -538,14 +562,28 @@ struct ReadOptions { // Default: nullptr const Snapshot* snapshot; + // If "prefix" is non-nullptr, and ReadOptions is being passed to + // db.NewIterator, only return results when the key begins with this + // prefix. This field is ignored by other calls (e.g., Get). + // Options.prefix_extractor must also be set, and + // prefix_extractor.InRange(prefix) must be true. The iterator + // returned by NewIterator when this option is set will behave just + // as if the underlying store did not contain any non-matching keys, + // with two exceptions. Seek() only accepts keys starting with the + // prefix, and SeekToLast() is not supported. prefix filter with this + // option will sometimes reduce the number of read IOPs. + // Default: nullptr + const Slice* prefix; + ReadOptions() : verify_checksums(false), fill_cache(true), - snapshot(nullptr) { + snapshot(nullptr), + prefix(nullptr) { } ReadOptions(bool cksum, bool cache) : verify_checksums(cksum), fill_cache(cache), - snapshot(nullptr) { + snapshot(nullptr), prefix(nullptr) { } }; diff --git a/include/leveldb/slice_transform.h b/include/leveldb/slice_transform.h new file mode 100644 index 000000000..fa94d2141 --- /dev/null +++ b/include/leveldb/slice_transform.h @@ -0,0 +1,41 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. + +#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_ +#define STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_ + +#include + +namespace leveldb { + +class Slice; + +class SliceTransform { + public: + virtual ~SliceTransform() {}; + + // Return the name of this transformation. 
+ virtual const char* Name() const = 0; + + // transform a src in domain to a dst in the range + virtual Slice Transform(const Slice& src) const = 0; + + // determine whether this is a valid src upon the function applies + virtual bool InDomain(const Slice& src) const = 0; + + // determine whether dst=Transform(src) for some src + virtual bool InRange(const Slice& dst) const = 0; +}; + +extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_ diff --git a/table/filter_block.cc b/table/filter_block.cc index b9dfc5c0b..38f807cef 100644 --- a/table/filter_block.cc +++ b/table/filter_block.cc @@ -4,6 +4,7 @@ #include "table/filter_block.h" +#include "db/dbformat.h" #include "leveldb/filter_policy.h" #include "util/coding.h" @@ -15,9 +16,11 @@ namespace leveldb { static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy) - : policy_(policy) { -} +FilterBlockBuilder::FilterBlockBuilder(const Options& opt) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(opt.comparator){} void FilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); @@ -27,10 +30,47 @@ void FilterBlockBuilder::StartBlock(uint64_t block_offset) { } } +bool FilterBlockBuilder::SamePrefix(const Slice &key1, + const Slice &key2) const { + if (!prefix_extractor_->InDomain(key1) && + !prefix_extractor_->InDomain(key2)) { + return true; + } else if (!prefix_extractor_->InDomain(key1) || + !prefix_extractor_->InDomain(key2)) { + return false; + } else { + return (prefix_extractor_->Transform(key1) == + prefix_extractor_->Transform(key2)); + } +} + void FilterBlockBuilder::AddKey(const Slice& key) { - Slice k = key; - start_.push_back(keys_.size()); - keys_.append(k.data(), k.size()); + Slice prev; + if (start_.size() > 0) { + size_t prev_start = start_[start_.size() - 1]; + const char* base = entries_.data() + prev_start; + size_t length = entries_.size() - prev_start; + prev = Slice(base, length); + } + + if (whole_key_filtering_) { + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); + } + + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. + if (prev.size() == 0 || ! 
SamePrefix(key, prev)) { + Slice prefix = prefix_extractor_->Transform(key); + assert(comparator_->Compare(prefix, key) <= 0); + InternalKey internal_prefix_tmp(prefix, 0, kTypeValue); + Slice internal_prefix = internal_prefix_tmp.Encode(); + start_.push_back(entries_.size()); + entries_.append(internal_prefix.data(), internal_prefix.size()); + } + } } Slice FilterBlockBuilder::Finish() { @@ -50,34 +90,35 @@ Slice FilterBlockBuilder::Finish() { } void FilterBlockBuilder::GenerateFilter() { - const size_t num_keys = start_.size(); - if (num_keys == 0) { + const size_t num_entries = start_.size(); + if (num_entries == 0) { // Fast path if there are no keys for this filter filter_offsets_.push_back(result_.size()); return; } // Make list of keys from flattened key structure - start_.push_back(keys_.size()); // Simplify length computation - tmp_keys_.resize(num_keys); - for (size_t i = 0; i < num_keys; i++) { - const char* base = keys_.data() + start_[i]; + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; size_t length = start_[i+1] - start_[i]; - tmp_keys_[i] = Slice(base, length); + tmp_entries_[i] = Slice(base, length); } // Generate filter for current set of keys and append to result_. filter_offsets_.push_back(result_.size()); - policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_); + policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_); - tmp_keys_.clear(); - keys_.clear(); + tmp_entries_.clear(); + entries_.clear(); start_.clear(); } -FilterBlockReader::FilterBlockReader(const FilterPolicy* policy, - const Slice& contents) - : policy_(policy), +FilterBlockReader::FilterBlockReader(const Options& opt, const Slice& contents) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), data_(nullptr), offset_(nullptr), num_(0), @@ -92,16 +133,32 @@ FilterBlockReader::FilterBlockReader(const FilterPolicy* policy, num_ = (n - 5 - last_word) / 4; } -bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) { +bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, + const Slice& key) { + if (!whole_key_filtering_) { + return true; + } + return MayMatch(block_offset, key); +} + +bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, + const Slice& prefix) { + if (!prefix_extractor_) { + return true; + } + return MayMatch(block_offset, prefix); +} + +bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { uint64_t index = block_offset >> base_lg_; if (index < num_) { uint32_t start = DecodeFixed32(offset_ + index*4); uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); if (start <= limit && limit <= (offset_ - data_)) { Slice filter = Slice(data_ + start, limit - start); - return policy_->KeyMayMatch(key, filter); + return policy_->KeyMayMatch(entry, filter); } else if (start == limit) { - // Empty filters do not match any keys + // Empty filters do not match any entries return false; } } diff --git a/table/filter_block.h b/table/filter_block.h index c67d010bd..7e6982188 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -13,7 +13,9 @@ #include #include #include +#include "leveldb/options.h" #include "leveldb/slice.h" +#include "leveldb/slice_transform.h" #include "util/hash.h" namespace leveldb { @@ -28,20 +30,25 @@ class FilterPolicy; // (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - 
explicit FilterBlockBuilder(const FilterPolicy*); + explicit FilterBlockBuilder(const Options& opt); void StartBlock(uint64_t block_offset); void AddKey(const Slice& key); Slice Finish(); private: + bool SamePrefix(const Slice &key1, const Slice &key2) const; void GenerateFilter(); const FilterPolicy* policy_; - std::string keys_; // Flattened key contents - std::vector start_; // Starting index in keys_ of each key - std::string result_; // Filter data computed so far - std::vector tmp_keys_; // policy_->CreateFilter() argument + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const Comparator* comparator_; + + std::string entries_; // Flattened entry contents + std::vector start_; // Starting index in entries_ of each entry + std::string result_; // Filter data computed so far + std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; // No copying allowed @@ -52,15 +59,20 @@ class FilterBlockBuilder { class FilterBlockReader { public: // REQUIRES: "contents" and *policy must stay live while *this is live. - FilterBlockReader(const FilterPolicy* policy, const Slice& contents); + FilterBlockReader(const Options& opt, const Slice& contents); bool KeyMayMatch(uint64_t block_offset, const Slice& key); + bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); private: const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; const char* data_; // Pointer to filter data (at block-start) const char* offset_; // Pointer to beginning of offset array (at block-end) size_t num_; // Number of entries in offset array size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + + bool MayMatch(uint64_t block_offset, const Slice& entry); }; } diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index cadb273b0..b61770fdf 100644 --- a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -41,19 +41,25 @@ class TestHashFilter : public FilterPolicy { class FilterBlockTest { public: TestHashFilter policy_; + Options options_; + + FilterBlockTest() { + options_ = Options(); + options_.filter_policy = &policy_; + } }; TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(&policy_); + FilterBlockBuilder builder(options_); Slice block = builder.Finish(); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - FilterBlockReader reader(&policy_, block); + FilterBlockReader reader(options_, block); ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); } TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(&policy_); + FilterBlockBuilder builder(options_); builder.StartBlock(100); builder.AddKey("foo"); builder.AddKey("bar"); @@ -63,7 +69,7 @@ TEST(FilterBlockTest, SingleChunk) { builder.StartBlock(300); builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(&policy_, block); + FilterBlockReader reader(options_, block); ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); ASSERT_TRUE(reader.KeyMayMatch(100, "box")); @@ -74,7 +80,7 @@ TEST(FilterBlockTest, SingleChunk) { } TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(&policy_); + FilterBlockBuilder builder(options_); // First filter builder.StartBlock(0); @@ -94,7 +100,7 @@ TEST(FilterBlockTest, MultiChunk) { builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(&policy_, block); + FilterBlockReader reader(options_, block); // Check first 
filter ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); diff --git a/table/table.cc b/table/table.cc index be0f43c90..9a5f1b761 100644 --- a/table/table.cc +++ b/table/table.cc @@ -4,6 +4,8 @@ #include "table/table.h" +#include "db/dbformat.h" + #include "leveldb/cache.h" #include "leveldb/comparator.h" #include "leveldb/env.h" @@ -207,7 +209,7 @@ void Table::ReadFilter(const Slice& filter_handle_value) { if (block.heap_allocated) { rep_->filter_data = block.data.data(); // Will need to delete later } - rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data); + rep_->filter = new FilterBlockReader(rep_->options, block.data); } Table::~Table() { @@ -318,7 +320,66 @@ Iterator* Table::BlockReader(void* arg, return BlockReader(arg, options, index_value, nullptr, for_compaction); } +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in Options.filter_policy. In particular, we +// require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +bool Table::PrefixMayMatch(const Slice& internal_prefix) const { + FilterBlockReader* filter = rep_->filter; + bool may_match = true; + Status s; + + if (filter == nullptr) { + return true; + } + + Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); + iiter->Seek(internal_prefix); + if (! iiter->Valid()) { + // we're past end of file + may_match = false; + } else if (iiter->key().starts_with(internal_prefix)) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only one which could potentially contain the prefix. + Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + may_match = filter->PrefixMayMatch(handle.offset(), internal_prefix); + } + delete iiter; + return may_match; +} + Iterator* Table::NewIterator(const ReadOptions& options) const { + if (options.prefix) { + InternalKey internal_prefix(*options.prefix, 0, kTypeValue); + if (!PrefixMayMatch(internal_prefix.Encode())) { + // nothing in this file can match the prefix, so we should not + // bother doing I/O to this file when iterating. 
+ return NewEmptyIterator(); + } + } + return NewTwoLevelIterator( rep_->index_block->NewIterator(rep_->options.comparator), &Table::BlockReader, const_cast(this), options, rep_->soptions); diff --git a/table/table.h b/table/table.h index b39a5c186..c2cb5a32b 100644 --- a/table/table.h +++ b/table/table.h @@ -47,6 +47,8 @@ class Table { ~Table(); + bool PrefixMayMatch(const Slice& prefix) const; + // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). diff --git a/table/table_builder.cc b/table/table_builder.cc index 078ffedf2..4e6b047e6 100644 --- a/table/table_builder.cc +++ b/table/table_builder.cc @@ -55,7 +55,7 @@ struct TableBuilder::Rep { num_entries(0), closed(false), filter_block(opt.filter_policy == nullptr ? nullptr - : new FilterBlockBuilder(opt.filter_policy)), + : new FilterBlockBuilder(opt)), pending_index_entry(false) { index_block_options.block_restart_interval = 1; } diff --git a/util/fixed_prefix_transform.cc b/util/fixed_prefix_transform.cc new file mode 100644 index 000000000..2eadb8b0a --- /dev/null +++ b/util/fixed_prefix_transform.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/slice_transform.h" + +#include "leveldb/slice.h" + +namespace leveldb { + +namespace { + +class FixedPrefixTransform : public SliceTransform { + private: + size_t prefix_len_; + + public: + explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return (src.size() >= prefix_len_); + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() == prefix_len_); + } +}; +} + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +} // namespace leveldb diff --git a/util/options.cc b/util/options.cc index cc5653326..56bc0ca73 100644 --- a/util/options.cc +++ b/util/options.cc @@ -33,6 +33,8 @@ Options::Options() block_restart_interval(16), compression(kSnappyCompression), filter_policy(nullptr), + prefix_extractor(nullptr), + whole_key_filtering(true), num_levels(7), level0_file_num_compaction_trigger(4), level0_slowdown_writes_trigger(8),
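
Usage sketch: the option wiring below mirrors the PrefixScan test above. The database path, the example keys, and the error handling are illustrative assumptions only; the relevant new pieces are Options.prefix_extractor, Options.whole_key_filtering, and ReadOptions.prefix.

#include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "leveldb/options.h"
#include "leveldb/slice_transform.h"

int main() {
  leveldb::Options options;
  options.create_if_missing = true;
  // Keys are assumed to carry a fixed 8-byte prefix, e.g. "03______".
  options.prefix_extractor = leveldb::NewFixedPrefixTransform(8);
  options.filter_policy = leveldb::NewBloomFilterPolicy(10);
  options.whole_key_filtering = false;  // keep only prefixes in the filter

  leveldb::DB* db = nullptr;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/prefix_example", &db);
  if (!s.ok()) return 1;
  db->Put(leveldb::WriteOptions(), "03______:key1", "v1");
  db->Put(leveldb::WriteOptions(), "04______:key2", "v2");

  // Restrict the scan to keys whose first 8 bytes are "03______". Files
  // whose filter block cannot contain this prefix are skipped entirely.
  leveldb::Slice prefix("03______");
  leveldb::ReadOptions ro;
  ro.prefix = &prefix;
  leveldb::Iterator* iter = db->NewIterator(ro);
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // Every returned key starts with "03______"; non-matching keys are
    // excluded by the PrefixFilterIterator wrapper.
  }
  s = iter->status();

  delete iter;
  delete db;
  delete options.filter_policy;
  delete options.prefix_extractor;
  return s.ok() ? 0 : 1;
}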
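
If fixed-length prefixes do not fit the key layout, an application can implement SliceTransform directly. The sketch below is hypothetical (the ':' delimiter convention, the class name, and NewDelimiterPrefixTransform are illustrative, not part of this patch): it treats everything up to and including the first ':' as the prefix, and with the default bytewise comparator it satisfies the four properties listed for Options.prefix_extractor.

#include <cassert>
#include <cstring>

#include "leveldb/slice.h"
#include "leveldb/slice_transform.h"

namespace {

class DelimiterPrefixTransform : public leveldb::SliceTransform {
 public:
  virtual const char* Name() const { return "example.DelimiterPrefix"; }

  // Return everything up to and including the first ':'. Keeping the
  // delimiter makes the transform idempotent, so that
  // Transform(Transform(key)) == Transform(key) (property 4 above).
  virtual leveldb::Slice Transform(const leveldb::Slice& src) const {
    assert(InDomain(src));
    const char* pos = static_cast<const char*>(
        std::memchr(src.data(), ':', src.size()));
    return leveldb::Slice(src.data(), pos - src.data() + 1);
  }

  // Only keys containing a ':' have a prefix; other keys simply do not
  // contribute an entry to the filter block.
  virtual bool InDomain(const leveldb::Slice& src) const {
    return std::memchr(src.data(), ':', src.size()) != nullptr;
  }

  // A valid prefix is non-empty, ends with ':' and has no earlier ':'.
  virtual bool InRange(const leveldb::Slice& dst) const {
    return dst.size() > 0 && dst[dst.size() - 1] == ':' &&
           std::memchr(dst.data(), ':', dst.size() - 1) == nullptr;
  }
};

}  // anonymous namespace

const leveldb::SliceTransform* NewDelimiterPrefixTransform() {
  return new DelimiterPrefixTransform;
}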