From 0578d9f9515b1e39e0b6483ca658a16bcdd8bc9e Mon Sep 17 00:00:00 2001
From: Wentian Guo
Date: Thu, 6 Apr 2023 15:39:38 -0700
Subject: [PATCH] Filter table files by timestamp: Get operator (#11332)

Summary:
If RocksDB enables user-defined timestamp, then the RocksDB read path can
filter table files by the min/max timestamps of each file. If the application
wants to look up a key that is the most recent and visible to a certain
timestamp ts, then we can compare ts with the min_ts of each file. If
ts < min_ts, then we know all keys in the file are not visible at time ts, so
we do not have to open the file. This can also save an IO.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11332

Reviewed By: pdillinger

Differential Revision: D44763497

Pulled By: guowentian

fbshipit-source-id: abde346b9f18480fe03c04e4006e7d62aa9c22a8
---
 db/db_with_timestamp_basic_test.cc            | 88 +++++++++++++++++++
 include/rocksdb/statistics.h                  |  6 ++
 monitoring/statistics.cc                      |  3 +
 table/block_based/block_based_table_reader.cc | 32 ++++++-
 table/block_based/block_based_table_reader.h  |  8 ++
 5 files changed, 134 insertions(+), 3 deletions(-)

diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index 4b8132df3..e58e6cd82 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -3918,6 +3918,94 @@ TEST_F(DBBasicTestWithTimestamp, IterSeekToLastWithIterateUpperbound) {
   ASSERT_FALSE(iter->Valid());
   ASSERT_OK(iter->status());
 }
+
+TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.block_size = 100;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Put
+  // Create two SST files
+  // file1: key => [1, 3], timestamp => [10, 20]
+  // file2: key => [2, 4], timestamp => [30, 40]
+  {
+    WriteOptions write_opts;
+    std::string write_ts = Timestamp(10, 0);
+    ASSERT_OK(db_->Put(write_opts, Key1(1), write_ts, "value1"));
+    write_ts = Timestamp(20, 0);
+    ASSERT_OK(db_->Put(write_opts, Key1(3), write_ts, "value3"));
+    ASSERT_OK(Flush());
+
+    write_ts = Timestamp(30, 0);
+    ASSERT_OK(db_->Put(write_opts, Key1(2), write_ts, "value2"));
+    write_ts = Timestamp(40, 0);
+    ASSERT_OK(db_->Put(write_opts, Key1(4), write_ts, "value4"));
+    ASSERT_OK(Flush());
+  }
+
+  // Get with timestamp
+  {
+    auto prev_checked_events = options.statistics->getTickerCount(
+        Tickers::TIMESTAMP_FILTER_TABLE_CHECKED);
+    auto prev_filtered_events = options.statistics->getTickerCount(
+        Tickers::TIMESTAMP_FILTER_TABLE_FILTERED);
+
+    // key=3 (ts=20) does not exist at timestamp=1
+    std::string read_ts_str = Timestamp(1, 0);
+    Slice read_ts_slice = Slice(read_ts_str);
+    ReadOptions read_opts;
+    read_opts.timestamp = &read_ts_slice;
+    std::string value_from_get = "";
+    std::string timestamp_from_get = "";
+    auto status =
+        db_->Get(read_opts, Key1(3), &value_from_get, &timestamp_from_get);
+    ASSERT_TRUE(status.IsNotFound());
+    ASSERT_EQ(value_from_get, std::string(""));
+    ASSERT_EQ(timestamp_from_get, std::string(""));
+
+    // key=3 is in the key ranges for both files, so both files will be queried.
+    // The table read was skipped because the timestamp is out of the table
+    // range, i.e., 1 < [10,20], [30,40].
+    // The tickers increase by 2 due to 2 files.
+    ASSERT_EQ(prev_checked_events + 2,
+              options.statistics->getTickerCount(
+                  Tickers::TIMESTAMP_FILTER_TABLE_CHECKED));
+    ASSERT_EQ(prev_filtered_events + 2,
+              options.statistics->getTickerCount(
+                  Tickers::TIMESTAMP_FILTER_TABLE_FILTERED));
+
+    // key=3 (ts=20) exists at timestamp = 25
+    read_ts_str = Timestamp(25, 0);
+    read_ts_slice = Slice(read_ts_str);
+    read_opts.timestamp = &read_ts_slice;
+    ASSERT_OK(
+        db_->Get(read_opts, Key1(3), &value_from_get, &timestamp_from_get));
+    ASSERT_EQ("value3", value_from_get);
+    ASSERT_EQ(Timestamp(20, 0), timestamp_from_get);
+
+    // file1 was not skipped, because the timestamp is in range, [10,20] < 25.
+    // file2 was skipped, because the timestamp is not in range, 25 < [30,40].
+    // So the checked ticker increases by 2 due to 2 files;
+    // the filtered ticker increases by 1 because file2 was skipped
+    ASSERT_EQ(prev_checked_events + 4,
+              options.statistics->getTickerCount(
+                  Tickers::TIMESTAMP_FILTER_TABLE_CHECKED));
+    ASSERT_EQ(prev_filtered_events + 3,
+              options.statistics->getTickerCount(
+                  Tickers::TIMESTAMP_FILTER_TABLE_FILTERED));
+  }
+
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index c10c67919..f25e02ebe 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -429,6 +429,12 @@ enum Tickers : uint32_t {
   // that finds its data for table open
   TABLE_OPEN_PREFETCH_TAIL_HIT,
 
+  // Statistics on the filtering by user-defined timestamps
+  // # of times timestamps are checked on accessing the table
+  TIMESTAMP_FILTER_TABLE_CHECKED,
+  // # of times timestamps can successfully help skip the table access
+  TIMESTAMP_FILTER_TABLE_FILTERED,
+
   TICKER_ENUM_MAX
 };
 
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 206372c7c..05de681a2 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -219,6 +219,9 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"},
     {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"},
     {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"},
+    {TIMESTAMP_FILTER_TABLE_CHECKED, "rocksdb.timestamp.filter.table.checked"},
+    {TIMESTAMP_FILTER_TABLE_FILTERED,
+     "rocksdb.timestamp.filter.table.filtered"},
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 55253f11e..687b56c3c 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -916,10 +916,18 @@ Status BlockBasedTable::ReadPropertiesBlock(
   // If table properties don't contain index type, we assume that the table
   // is in very old format and has kBinarySearch index type.
   auto& props = rep_->table_properties->user_collected_properties;
-  auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
-  if (pos != props.end()) {
+  auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+  if (index_type_pos != props.end()) {
     rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
-        DecodeFixed32(pos->second.c_str()));
+        DecodeFixed32(index_type_pos->second.c_str()));
+  }
+  auto min_ts_pos = props.find("rocksdb.timestamp_min");
+  if (min_ts_pos != props.end()) {
+    rep_->min_timestamp = Slice(min_ts_pos->second);
+  }
+  auto max_ts_pos = props.find("rocksdb.timestamp_max");
+  if (max_ts_pos != props.end()) {
+    rep_->max_timestamp = Slice(max_ts_pos->second);
   }
 
   rep_->index_has_first_key =
@@ -1983,10 +1991,28 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options,
   return Status::OK();
 }
 
+bool BlockBasedTable::TimestampMayMatch(const ReadOptions& read_options) const {
+  if (read_options.timestamp != nullptr && !rep_->min_timestamp.empty()) {
+    RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_CHECKED);
+    auto read_ts = read_options.timestamp;
+    auto comparator = rep_->internal_comparator.user_comparator();
+    if (comparator->CompareTimestamp(*read_ts, rep_->min_timestamp) < 0) {
+      RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_FILTERED);
+      return false;
+    }
+  }
+  return true;
+}
+
 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
                             GetContext* get_context,
                             const SliceTransform* prefix_extractor,
                             bool skip_filters) {
+  // Similar to Bloom filter !may_match
+  // If timestamp is beyond the range of the table, skip
+  if (!TimestampMayMatch(read_options)) {
+    return Status::OK();
+  }
   assert(key.size() >= 8);  // key must be internal key
   assert(get_context != nullptr);
   Status s;
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index ec9473b00..3df5cb6cb 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -494,6 +494,8 @@ class BlockBasedTable : public TableReader {
   // in building the table file, otherwise true.
   bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const;
 
+  bool TimestampMayMatch(const ReadOptions& read_options) const;
+
   // A cumulative data block file read in MultiGet lower than this size will
   // use a stack buffer
   static constexpr size_t kMultiGetReadStackBufSize = 8192;
@@ -595,6 +597,12 @@ struct BlockBasedTable::Rep {
   // move is involved
   int level;
 
+  // the timestamp range of table
+  // Points into memory owned by TableProperties. This would need to change if
+  // TableProperties become subject to cache eviction.
+  Slice min_timestamp;
+  Slice max_timestamp;
+
   // If false, blocks in this file are definitely all uncompressed. Knowing this
   // before reading individual blocks enables certain optimizations.
   bool blocks_maybe_compressed = true;