From f5f184228291cd885918555044276ba8dd4e9568 Mon Sep 17 00:00:00 2001 From: Tyler Harter Date: Tue, 13 Aug 2013 14:04:56 -0700 Subject: [PATCH] Prefix filters for scans (v4) Summary: Similar to v2 (db and table code understands prefixes), but use ReadOptions as in v3. Also, make the CreateFilter code faster and cleaner. Test Plan: make db_test; export LEVELDB_TESTS=PrefixScan; ./db_test Reviewers: dhruba Reviewed By: dhruba CC: haobo, emayanke Differential Revision: https://reviews.facebook.net/D12027 --- db/db_impl.cc | 20 +++-- db/db_test.cc | 129 ++++++++++++++++++++++++++++++ db/prefix_filter_iterator.h | 76 ++++++++++++++++++ include/leveldb/options.h | 42 +++++++++- include/leveldb/slice_transform.h | 41 ++++++++++ table/filter_block.cc | 101 ++++++++++++++++++----- table/filter_block.h | 24 ++++-- table/filter_block_test.cc | 18 +++-- table/table.cc | 63 ++++++++++++++- table/table.h | 2 + table/table_builder.cc | 2 +- util/fixed_prefix_transform.cc | 43 ++++++++++ util/options.cc | 2 + 13 files changed, 519 insertions(+), 44 deletions(-) create mode 100644 db/prefix_filter_iterator.h create mode 100644 include/leveldb/slice_transform.h create mode 100644 util/fixed_prefix_transform.cc diff --git a/db/db_impl.cc b/db/db_impl.cc index 6083746fa..e510c4505 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -23,6 +23,7 @@ #include "db/memtable.h" #include "db/memtablelist.h" #include "db/merge_helper.h" +#include "db/prefix_filter_iterator.h" #include "db/table_cache.h" #include "db/version_set.h" #include "db/write_batch_internal.h" @@ -2339,12 +2340,19 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - return NewDBIterator( - &dbname_, env_, options_, user_comparator(), internal_iter, - (options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); + Iterator* iter = NewInternalIterator(options, &latest_snapshot); + iter = NewDBIterator( + &dbname_, env_, options_, user_comparator(), iter, + (options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); + if (options.prefix) { + // use extra wrapper to exclude any keys from the results which + // don't begin with the prefix + iter = new PrefixFilterIterator(iter, *options.prefix, + options_.prefix_extractor); + } + return iter; } const Snapshot* DBImpl::GetSnapshot() { diff --git a/db/db_test.cc b/db/db_test.cc index 18f2e8832..492ae04be 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3640,6 +3640,135 @@ TEST(DBTest, MultiGetEmpty) { } while (ChangeCompactOptions()); } +void PrefixScanInit(DBTest *dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_CompactMemTable(); + dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_CompactMemTable(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + std::string keystr; + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", + small_range_sstfiles+i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_CompactMemTable(); + } +} + +TEST(DBTest, PrefixScan) { + ReadOptions ro = ReadOptions(); + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.block_cache = NewLRUCache(0); // Prevent cache hits + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor = NewFixedPrefixTransform(8); + options.whole_key_filtering = false; + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.disable_seek_compaction = true; + + // prefix specified, with blooms: 2 RAND I/Os + // SeekToFirst + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + ASSERT_TRUE(iter->status().ok()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // prefix specified, with blooms: 2 RAND I/Os + // Seek + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + ASSERT_TRUE(iter->status().ok()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // no prefix specified: 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! 
iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_TRUE(iter->status().ok()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 11); + Close(); + delete options.filter_policy; + delete options.prefix_extractor; +} + std::string MakeKey(unsigned int num) { char buf[30]; snprintf(buf, sizeof(buf), "%016u", num); diff --git a/db/prefix_filter_iterator.h b/db/prefix_filter_iterator.h new file mode 100644 index 000000000..1e0fbe3ca --- /dev/null +++ b/db/prefix_filter_iterator.h @@ -0,0 +1,76 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Wrap an underlying iterator, but exclude any results not starting +// with a given prefix. Seeking to keys not beginning with the prefix +// is invalid, and SeekToLast is not implemented (that would be +// non-trivial), but otherwise this iterator will behave just like the +// underlying iterator would if there happened to be no non-matching +// keys in the dataset. + +#ifndef STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_ +#define STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_ + +#include "leveldb/iterator.h" + +namespace leveldb { + +class PrefixFilterIterator : public Iterator { + private: + Iterator* iter_; + const Slice &prefix_; + const SliceTransform *prefix_extractor_; + Status status_; + + public: + PrefixFilterIterator(Iterator* iter, + const Slice &prefix, + const SliceTransform* prefix_extractor) + : iter_(iter), prefix_(prefix), + prefix_extractor_(prefix_extractor), + status_(Status::OK()) { + if (prefix_extractor == nullptr) { + status_ = Status::InvalidArgument("A prefix filter may not be used " + "unless a function is also defined " + "for extracting prefixes"); + } else if (!prefix_extractor_->InRange(prefix)) { + status_ = Status::InvalidArgument("Must provide a slice for prefix which" + "is a prefix for some key"); + } + } + ~PrefixFilterIterator() { + delete iter_; + } + Slice key() const { return iter_->key(); } + Slice value() const { return iter_->value(); } + Status status() const { + if (!status_.ok()) { + return status_; + } + return iter_->status(); + } + void Next() { iter_->Next(); } + void Prev() { iter_->Prev(); } + void Seek(const Slice& k) { + if (prefix_extractor_->Transform(k) == prefix_) { + iter_->Seek(k); + } else { + status_ = Status::InvalidArgument("Seek must begin with target prefix"); + } + } + void SeekToFirst() { + Seek(prefix_); + } + void SeekToLast() { + status_ = Status::NotSupported("SeekToLast is incompatible with prefixes"); + } + bool Valid() const { + return (status_.ok() && iter_->Valid() && + prefix_extractor_->Transform(iter_->key()) == prefix_); + } +}; + +} // namespace leveldb + +#endif diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 30a8dc328..63899d8f7 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -15,6 +15,8 @@ #include "leveldb/universal_compaction.h" #include "leveldb/memtablerep.h" +#include "leveldb/slice_transform.h" + namespace leveldb { class Cache; @@ -224,6 +226,28 @@ struct Options { // Default: nullptr const FilterPolicy* filter_policy; + // If non-nullptr, use the specified function to determine the + // prefixes for keys. These prefixes will be placed in the filter. 
+ // Depending on the workload, this can reduce the number of read-IOP + // cost for scans when a prefix is passed via ReadOptions to + // db.NewIterator(). For prefix filtering to work properly, + // "prefix_extractor" and "comparator" must be such that the following + // properties hold: + // + // 1) key.starts_with(prefix(key)) + // 2) Compare(prefix(key), key) <= 0. + // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 + // 4) prefix(prefix(key)) == prefix(key) + // + // Default: nullptr + const SliceTransform* prefix_extractor; + + // If true, place whole keys in the filter (not just prefixes). + // This must generally be true for gets to be efficient. + // + // Default: true + bool whole_key_filtering; + // Number of levels for this database int num_levels; @@ -538,14 +562,28 @@ struct ReadOptions { // Default: nullptr const Snapshot* snapshot; + // If "prefix" is non-nullptr, and ReadOptions is being passed to + // db.NewIterator, only return results when the key begins with this + // prefix. This field is ignored by other calls (e.g., Get). + // Options.prefix_extractor must also be set, and + // prefix_extractor.InRange(prefix) must be true. The iterator + // returned by NewIterator when this option is set will behave just + // as if the underlying store did not contain any non-matching keys, + // with two exceptions. Seek() only accepts keys starting with the + // prefix, and SeekToLast() is not supported. prefix filter with this + // option will sometimes reduce the number of read IOPs. + // Default: nullptr + const Slice* prefix; + ReadOptions() : verify_checksums(false), fill_cache(true), - snapshot(nullptr) { + snapshot(nullptr), + prefix(nullptr) { } ReadOptions(bool cksum, bool cache) : verify_checksums(cksum), fill_cache(cache), - snapshot(nullptr) { + snapshot(nullptr), prefix(nullptr) { } }; diff --git a/include/leveldb/slice_transform.h b/include/leveldb/slice_transform.h new file mode 100644 index 000000000..fa94d2141 --- /dev/null +++ b/include/leveldb/slice_transform.h @@ -0,0 +1,41 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. + +#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_ +#define STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_ + +#include + +namespace leveldb { + +class Slice; + +class SliceTransform { + public: + virtual ~SliceTransform() {}; + + // Return the name of this transformation. 
+ virtual const char* Name() const = 0; + + // transform a src in domain to a dst in the range + virtual Slice Transform(const Slice& src) const = 0; + + // determine whether this is a valid src upon the function applies + virtual bool InDomain(const Slice& src) const = 0; + + // determine whether dst=Transform(src) for some src + virtual bool InRange(const Slice& dst) const = 0; +}; + +extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_SLICE_TRANSFORM_H_ diff --git a/table/filter_block.cc b/table/filter_block.cc index b9dfc5c0b..38f807cef 100644 --- a/table/filter_block.cc +++ b/table/filter_block.cc @@ -4,6 +4,7 @@ #include "table/filter_block.h" +#include "db/dbformat.h" #include "leveldb/filter_policy.h" #include "util/coding.h" @@ -15,9 +16,11 @@ namespace leveldb { static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy) - : policy_(policy) { -} +FilterBlockBuilder::FilterBlockBuilder(const Options& opt) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(opt.comparator){} void FilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); @@ -27,10 +30,47 @@ void FilterBlockBuilder::StartBlock(uint64_t block_offset) { } } +bool FilterBlockBuilder::SamePrefix(const Slice &key1, + const Slice &key2) const { + if (!prefix_extractor_->InDomain(key1) && + !prefix_extractor_->InDomain(key2)) { + return true; + } else if (!prefix_extractor_->InDomain(key1) || + !prefix_extractor_->InDomain(key2)) { + return false; + } else { + return (prefix_extractor_->Transform(key1) == + prefix_extractor_->Transform(key2)); + } +} + void FilterBlockBuilder::AddKey(const Slice& key) { - Slice k = key; - start_.push_back(keys_.size()); - keys_.append(k.data(), k.size()); + Slice prev; + if (start_.size() > 0) { + size_t prev_start = start_[start_.size() - 1]; + const char* base = entries_.data() + prev_start; + size_t length = entries_.size() - prev_start; + prev = Slice(base, length); + } + + if (whole_key_filtering_) { + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); + } + + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. + if (prev.size() == 0 || ! 
SamePrefix(key, prev)) { + Slice prefix = prefix_extractor_->Transform(key); + assert(comparator_->Compare(prefix, key) <= 0); + InternalKey internal_prefix_tmp(prefix, 0, kTypeValue); + Slice internal_prefix = internal_prefix_tmp.Encode(); + start_.push_back(entries_.size()); + entries_.append(internal_prefix.data(), internal_prefix.size()); + } + } } Slice FilterBlockBuilder::Finish() { @@ -50,34 +90,35 @@ Slice FilterBlockBuilder::Finish() { } void FilterBlockBuilder::GenerateFilter() { - const size_t num_keys = start_.size(); - if (num_keys == 0) { + const size_t num_entries = start_.size(); + if (num_entries == 0) { // Fast path if there are no keys for this filter filter_offsets_.push_back(result_.size()); return; } // Make list of keys from flattened key structure - start_.push_back(keys_.size()); // Simplify length computation - tmp_keys_.resize(num_keys); - for (size_t i = 0; i < num_keys; i++) { - const char* base = keys_.data() + start_[i]; + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; size_t length = start_[i+1] - start_[i]; - tmp_keys_[i] = Slice(base, length); + tmp_entries_[i] = Slice(base, length); } // Generate filter for current set of keys and append to result_. filter_offsets_.push_back(result_.size()); - policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_); + policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_); - tmp_keys_.clear(); - keys_.clear(); + tmp_entries_.clear(); + entries_.clear(); start_.clear(); } -FilterBlockReader::FilterBlockReader(const FilterPolicy* policy, - const Slice& contents) - : policy_(policy), +FilterBlockReader::FilterBlockReader(const Options& opt, const Slice& contents) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor), + whole_key_filtering_(opt.whole_key_filtering), data_(nullptr), offset_(nullptr), num_(0), @@ -92,16 +133,32 @@ FilterBlockReader::FilterBlockReader(const FilterPolicy* policy, num_ = (n - 5 - last_word) / 4; } -bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) { +bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, + const Slice& key) { + if (!whole_key_filtering_) { + return true; + } + return MayMatch(block_offset, key); +} + +bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, + const Slice& prefix) { + if (!prefix_extractor_) { + return true; + } + return MayMatch(block_offset, prefix); +} + +bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { uint64_t index = block_offset >> base_lg_; if (index < num_) { uint32_t start = DecodeFixed32(offset_ + index*4); uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); if (start <= limit && limit <= (offset_ - data_)) { Slice filter = Slice(data_ + start, limit - start); - return policy_->KeyMayMatch(key, filter); + return policy_->KeyMayMatch(entry, filter); } else if (start == limit) { - // Empty filters do not match any keys + // Empty filters do not match any entries return false; } } diff --git a/table/filter_block.h b/table/filter_block.h index c67d010bd..7e6982188 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -13,7 +13,9 @@ #include #include #include +#include "leveldb/options.h" #include "leveldb/slice.h" +#include "leveldb/slice_transform.h" #include "util/hash.h" namespace leveldb { @@ -28,20 +30,25 @@ class FilterPolicy; // (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - 
explicit FilterBlockBuilder(const FilterPolicy*); + explicit FilterBlockBuilder(const Options& opt); void StartBlock(uint64_t block_offset); void AddKey(const Slice& key); Slice Finish(); private: + bool SamePrefix(const Slice &key1, const Slice &key2) const; void GenerateFilter(); const FilterPolicy* policy_; - std::string keys_; // Flattened key contents - std::vector start_; // Starting index in keys_ of each key - std::string result_; // Filter data computed so far - std::vector tmp_keys_; // policy_->CreateFilter() argument + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const Comparator* comparator_; + + std::string entries_; // Flattened entry contents + std::vector start_; // Starting index in entries_ of each entry + std::string result_; // Filter data computed so far + std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; // No copying allowed @@ -52,15 +59,20 @@ class FilterBlockBuilder { class FilterBlockReader { public: // REQUIRES: "contents" and *policy must stay live while *this is live. - FilterBlockReader(const FilterPolicy* policy, const Slice& contents); + FilterBlockReader(const Options& opt, const Slice& contents); bool KeyMayMatch(uint64_t block_offset, const Slice& key); + bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); private: const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; const char* data_; // Pointer to filter data (at block-start) const char* offset_; // Pointer to beginning of offset array (at block-end) size_t num_; // Number of entries in offset array size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + + bool MayMatch(uint64_t block_offset, const Slice& entry); }; } diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index cadb273b0..b61770fdf 100644 --- a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -41,19 +41,25 @@ class TestHashFilter : public FilterPolicy { class FilterBlockTest { public: TestHashFilter policy_; + Options options_; + + FilterBlockTest() { + options_ = Options(); + options_.filter_policy = &policy_; + } }; TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(&policy_); + FilterBlockBuilder builder(options_); Slice block = builder.Finish(); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - FilterBlockReader reader(&policy_, block); + FilterBlockReader reader(options_, block); ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); } TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(&policy_); + FilterBlockBuilder builder(options_); builder.StartBlock(100); builder.AddKey("foo"); builder.AddKey("bar"); @@ -63,7 +69,7 @@ TEST(FilterBlockTest, SingleChunk) { builder.StartBlock(300); builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(&policy_, block); + FilterBlockReader reader(options_, block); ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); ASSERT_TRUE(reader.KeyMayMatch(100, "box")); @@ -74,7 +80,7 @@ TEST(FilterBlockTest, SingleChunk) { } TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(&policy_); + FilterBlockBuilder builder(options_); // First filter builder.StartBlock(0); @@ -94,7 +100,7 @@ TEST(FilterBlockTest, MultiChunk) { builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(&policy_, block); + FilterBlockReader reader(options_, block); // Check first 
filter ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); diff --git a/table/table.cc b/table/table.cc index be0f43c90..9a5f1b761 100644 --- a/table/table.cc +++ b/table/table.cc @@ -4,6 +4,8 @@ #include "table/table.h" +#include "db/dbformat.h" + #include "leveldb/cache.h" #include "leveldb/comparator.h" #include "leveldb/env.h" @@ -207,7 +209,7 @@ void Table::ReadFilter(const Slice& filter_handle_value) { if (block.heap_allocated) { rep_->filter_data = block.data.data(); // Will need to delete later } - rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data); + rep_->filter = new FilterBlockReader(rep_->options, block.data); } Table::~Table() { @@ -318,7 +320,66 @@ Iterator* Table::BlockReader(void* arg, return BlockReader(arg, options, index_value, nullptr, for_compaction); } +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in Options.filter_policy. In particular, we +// require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +bool Table::PrefixMayMatch(const Slice& internal_prefix) const { + FilterBlockReader* filter = rep_->filter; + bool may_match = true; + Status s; + + if (filter == nullptr) { + return true; + } + + Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); + iiter->Seek(internal_prefix); + if (! iiter->Valid()) { + // we're past end of file + may_match = false; + } else if (iiter->key().starts_with(internal_prefix)) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only one which could potentially contain the prefix. + Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + may_match = filter->PrefixMayMatch(handle.offset(), internal_prefix); + } + delete iiter; + return may_match; +} + Iterator* Table::NewIterator(const ReadOptions& options) const { + if (options.prefix) { + InternalKey internal_prefix(*options.prefix, 0, kTypeValue); + if (!PrefixMayMatch(internal_prefix.Encode())) { + // nothing in this file can match the prefix, so we should not + // bother doing I/O to this file when iterating. 
+ return NewEmptyIterator(); + } + } + return NewTwoLevelIterator( rep_->index_block->NewIterator(rep_->options.comparator), &Table::BlockReader, const_cast(this), options, rep_->soptions); diff --git a/table/table.h b/table/table.h index b39a5c186..c2cb5a32b 100644 --- a/table/table.h +++ b/table/table.h @@ -47,6 +47,8 @@ class Table { ~Table(); + bool PrefixMayMatch(const Slice& prefix) const; + // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). diff --git a/table/table_builder.cc b/table/table_builder.cc index 078ffedf2..4e6b047e6 100644 --- a/table/table_builder.cc +++ b/table/table_builder.cc @@ -55,7 +55,7 @@ struct TableBuilder::Rep { num_entries(0), closed(false), filter_block(opt.filter_policy == nullptr ? nullptr - : new FilterBlockBuilder(opt.filter_policy)), + : new FilterBlockBuilder(opt)), pending_index_entry(false) { index_block_options.block_restart_interval = 1; } diff --git a/util/fixed_prefix_transform.cc b/util/fixed_prefix_transform.cc new file mode 100644 index 000000000..2eadb8b0a --- /dev/null +++ b/util/fixed_prefix_transform.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/slice_transform.h" + +#include "leveldb/slice.h" + +namespace leveldb { + +namespace { + +class FixedPrefixTransform : public SliceTransform { + private: + size_t prefix_len_; + + public: + explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return (src.size() >= prefix_len_); + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() == prefix_len_); + } +}; +} + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +} // namespace leveldb diff --git a/util/options.cc b/util/options.cc index cc5653326..56bc0ca73 100644 --- a/util/options.cc +++ b/util/options.cc @@ -33,6 +33,8 @@ Options::Options() block_restart_interval(16), compression(kSnappyCompression), filter_policy(nullptr), + prefix_extractor(nullptr), + whole_key_filtering(true), num_levels(7), level0_file_num_compaction_trigger(4), level0_slowdown_writes_trigger(8),
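
Usage sketch: the option wiring below mirrors the PrefixScan test above. The database path, the example keys, and the error handling are illustrative assumptions only; the relevant new pieces are Options.prefix_extractor, Options.whole_key_filtering, and ReadOptions.prefix.

#include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "leveldb/options.h"
#include "leveldb/slice_transform.h"

int main() {
  leveldb::Options options;
  options.create_if_missing = true;
  // Keys are assumed to carry a fixed 8-byte prefix, e.g. "03______".
  options.prefix_extractor = leveldb::NewFixedPrefixTransform(8);
  options.filter_policy = leveldb::NewBloomFilterPolicy(10);
  options.whole_key_filtering = false;  // keep only prefixes in the filter

  leveldb::DB* db = nullptr;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/prefix_example", &db);
  if (!s.ok()) return 1;
  db->Put(leveldb::WriteOptions(), "03______:key1", "v1");
  db->Put(leveldb::WriteOptions(), "04______:key2", "v2");

  // Restrict the scan to keys whose first 8 bytes are "03______". Files
  // whose filter block cannot contain this prefix are skipped entirely.
  leveldb::Slice prefix("03______");
  leveldb::ReadOptions ro;
  ro.prefix = &prefix;
  leveldb::Iterator* iter = db->NewIterator(ro);
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // Every returned key starts with "03______"; non-matching keys are
    // excluded by the PrefixFilterIterator wrapper.
  }
  s = iter->status();

  delete iter;
  delete db;
  delete options.filter_policy;
  delete options.prefix_extractor;
  return s.ok() ? 0 : 1;
}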
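
If fixed-length prefixes do not fit the key layout, an application can implement SliceTransform directly. The sketch below is hypothetical (the ':' delimiter convention, the class name, and NewDelimiterPrefixTransform are illustrative, not part of this patch): it treats everything up to and including the first ':' as the prefix, and with the default bytewise comparator it satisfies the four properties listed for Options.prefix_extractor.

#include <cassert>
#include <cstring>

#include "leveldb/slice.h"
#include "leveldb/slice_transform.h"

namespace {

class DelimiterPrefixTransform : public leveldb::SliceTransform {
 public:
  virtual const char* Name() const { return "example.DelimiterPrefix"; }

  // Return everything up to and including the first ':'. Keeping the
  // delimiter makes the transform idempotent, so that
  // Transform(Transform(key)) == Transform(key) (property 4 above).
  virtual leveldb::Slice Transform(const leveldb::Slice& src) const {
    assert(InDomain(src));
    const char* pos = static_cast<const char*>(
        std::memchr(src.data(), ':', src.size()));
    return leveldb::Slice(src.data(), pos - src.data() + 1);
  }

  // Only keys containing a ':' have a prefix; other keys simply do not
  // contribute an entry to the filter block.
  virtual bool InDomain(const leveldb::Slice& src) const {
    return std::memchr(src.data(), ':', src.size()) != nullptr;
  }

  // A valid prefix is non-empty, ends with ':' and has no earlier ':'.
  virtual bool InRange(const leveldb::Slice& dst) const {
    return dst.size() > 0 && dst[dst.size() - 1] == ':' &&
           std::memchr(dst.data(), ':', dst.size() - 1) == nullptr;
  }
};

}  // anonymous namespace

const leveldb::SliceTransform* NewDelimiterPrefixTransform() {
  return new DelimiterPrefixTransform;
}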