expose a hook to skip tables during iteration

Summary:
As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true.

This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand.

We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup in  from hours to seconds.

FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a`std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin.

```diff
 --- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1546,6 +1546,13 @@ struct ReadOptions {
   // Default: false
   bool ignore_range_deletions;

+  // A callback to determine whether relevant keys for this scan exist in a
+  // given table based on the table's properties. The callback is passed the
+  // properties of each table during iteration. If the callback returns false,
+  // the table will not be scanned.
+  // Default: empty (every table will be scanned)
+  std::function<bool(const TableProperties&)> table_filter;
+
   ReadOptions();
   ReadOptions(bool cksum, bool cache);
 };
```

/cc danhhz
Closes https://github.com/facebook/rocksdb/pull/2265

Differential Revision: D5054262

Pulled By: yiwu-arbug

fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476
main
Nikhil Benesch 7 years ago committed by Facebook Github Bot
parent eaaef91178
commit 7891af8b53
  1. 65
      db/db_iterator_test.cc
  2. 5
      db/table_cache.cc
  3. 8
      include/rocksdb/options.h

@ -2010,6 +2010,71 @@ TEST_F(DBIteratorTest, CreationFailure) {
delete iter; delete iter;
} }
TEST_F(DBIteratorTest, TableFilter) {
ASSERT_OK(Put("a", "1"));
dbfull()->Flush(FlushOptions());
ASSERT_OK(Put("b", "2"));
ASSERT_OK(Put("c", "3"));
dbfull()->Flush(FlushOptions());
ASSERT_OK(Put("d", "4"));
ASSERT_OK(Put("e", "5"));
ASSERT_OK(Put("f", "6"));
dbfull()->Flush(FlushOptions());
// Ensure the table_filter callback is called once for each table.
{
std::set<uint64_t> unseen {1, 2, 3};
ReadOptions opts;
opts.table_filter = [&](const TableProperties& props) {
auto it = unseen.find(props.num_entries);
if (it == unseen.end()) {
ADD_FAILURE() << "saw table properties with an unexpected " << props.num_entries << " entries";
} else {
unseen.erase(it);
}
return true;
};
auto iter = db_->NewIterator(opts);
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->1");
iter->Next();
ASSERT_EQ(IterStatus(iter), "b->2");
iter->Next();
ASSERT_EQ(IterStatus(iter), "c->3");
iter->Next();
ASSERT_EQ(IterStatus(iter), "d->4");
iter->Next();
ASSERT_EQ(IterStatus(iter), "e->5");
iter->Next();
ASSERT_EQ(IterStatus(iter), "f->6");
iter->Next();
ASSERT_FALSE(iter->Valid());
ASSERT_TRUE(unseen.empty());
delete iter;
}
// Ensure returning false in the table_filter hides the keys from that table
// during iteration.
{
ReadOptions opts;
opts.table_filter = [](const TableProperties& props) {
return props.num_entries != 2;
};
auto iter = db_->NewIterator(opts);
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->1");
iter->Next();
ASSERT_EQ(IterStatus(iter), "d->4");
iter->Next();
ASSERT_EQ(IterStatus(iter), "e->5");
iter->Next();
ASSERT_EQ(IterStatus(iter), "f->6");
iter->Next();
ASSERT_FALSE(iter->Valid());
delete iter;
}
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -225,7 +225,12 @@ InternalIterator* TableCache::NewIterator(
} }
InternalIterator* result = nullptr; InternalIterator* result = nullptr;
if (s.ok()) { if (s.ok()) {
if (options.table_filter &&
!options.table_filter(*table_reader->GetTableProperties())) {
result = NewEmptyInternalIterator(arena);
} else {
result = table_reader->NewIterator(options, arena, skip_filters); result = table_reader->NewIterator(options, arena, skip_filters);
}
if (create_new_table_reader) { if (create_new_table_reader) {
assert(handle == nullptr); assert(handle == nullptr);
result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr); result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr);

@ -1063,6 +1063,14 @@ struct ReadOptions {
// Default: false // Default: false
bool ignore_range_deletions; bool ignore_range_deletions;
// A callback to determine whether relevant keys for this scan exist in a
// given table based on the table's properties. The callback is passed the
// properties of each table during iteration. If the callback returns false,
// the table will not be scanned. This option only affects Iterators and has
// no impact on point lookups.
// Default: empty (every table will be scanned)
std::function<bool(const TableProperties&)> table_filter;
ReadOptions(); ReadOptions();
ReadOptions(bool cksum, bool cache); ReadOptions(bool cksum, bool cache);
}; };

Loading…
Cancel
Save