diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index 4b8132df3..e58e6cd82 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -3918,6 +3918,94 @@ TEST_F(DBBasicTestWithTimestamp, IterSeekToLastWithIterateUpperbound) { ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); } + +TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + // Put + // Create two SST files + // file1: key => [1, 3], timestamp => [10, 20] + // file2, key => [2, 4], timestamp => [30, 40] + { + WriteOptions write_opts; + std::string write_ts = Timestamp(10, 0); + ASSERT_OK(db_->Put(write_opts, Key1(1), write_ts, "value1")); + write_ts = Timestamp(20, 0); + ASSERT_OK(db_->Put(write_opts, Key1(3), write_ts, "value3")); + ASSERT_OK(Flush()); + + write_ts = Timestamp(30, 0); + ASSERT_OK(db_->Put(write_opts, Key1(2), write_ts, "value2")); + write_ts = Timestamp(40, 0); + ASSERT_OK(db_->Put(write_opts, Key1(4), write_ts, "value4")); + ASSERT_OK(Flush()); + } + + // Get with timestamp + { + auto prev_checked_events = options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_CHECKED); + auto prev_filtered_events = options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_FILTERED); + + // key=3 (ts=20) does not exist at timestamp=1 + std::string read_ts_str = Timestamp(1, 0); + Slice read_ts_slice = Slice(read_ts_str); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + std::string value_from_get = ""; + std::string timestamp_from_get = ""; + auto status = + db_->Get(read_opts, Key1(3), &value_from_get, ×tamp_from_get); + ASSERT_TRUE(status.IsNotFound()); + ASSERT_EQ(value_from_get, std::string("")); + ASSERT_EQ(timestamp_from_get, std::string("")); + + // key=3 is in the key ranges for both files, so both files will be queried. + // The table read was skipped because the timestamp is out of the table + // range, i.e.., 1 < [10,20], [30,40]. + // The tickers increase by 2 due to 2 files. + ASSERT_EQ(prev_checked_events + 2, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_CHECKED)); + ASSERT_EQ(prev_filtered_events + 2, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_FILTERED)); + + // key=3 (ts=20) exists at timestamp = 25 + read_ts_str = Timestamp(25, 0); + read_ts_slice = Slice(read_ts_str); + read_opts.timestamp = &read_ts_slice; + ASSERT_OK( + db_->Get(read_opts, Key1(3), &value_from_get, ×tamp_from_get)); + ASSERT_EQ("value3", value_from_get); + ASSERT_EQ(Timestamp(20, 0), timestamp_from_get); + + // file1 was not skipped, because the timestamp is in range, [10,20] < 25. + // file2 was skipped, because the timestamp is not in range, 25 < [30,40]. + // So the checked ticker increase by 2 due to 2 files; + // filtered ticker increase by 1 because file2 was skipped + ASSERT_EQ(prev_checked_events + 4, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_CHECKED)); + ASSERT_EQ(prev_filtered_events + 3, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_FILTERED)); + } + + Close(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index c10c67919..f25e02ebe 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -429,6 +429,12 @@ enum Tickers : uint32_t { // that finds its data for table open TABLE_OPEN_PREFETCH_TAIL_HIT, + // Statistics on the filtering by user-defined timestamps + // # of times timestamps are checked on accessing the table + TIMESTAMP_FILTER_TABLE_CHECKED, + // # of times timestamps can successfully help skip the table access + TIMESTAMP_FILTER_TABLE_FILTERED, + TICKER_ENUM_MAX }; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 206372c7c..05de681a2 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -219,6 +219,9 @@ const std::vector> TickersNameMap = { {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"}, {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"}, + {TIMESTAMP_FILTER_TABLE_CHECKED, "rocksdb.timestamp.filter.table.checked"}, + {TIMESTAMP_FILTER_TABLE_FILTERED, + "rocksdb.timestamp.filter.table.filtered"}, }; const std::vector> HistogramsNameMap = { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 55253f11e..687b56c3c 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -916,10 +916,18 @@ Status BlockBasedTable::ReadPropertiesBlock( // If table properties don't contain index type, we assume that the table // is in very old format and has kBinarySearch index type. auto& props = rep_->table_properties->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { + auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (index_type_pos != props.end()) { rep_->index_type = static_cast( - DecodeFixed32(pos->second.c_str())); + DecodeFixed32(index_type_pos->second.c_str())); + } + auto min_ts_pos = props.find("rocksdb.timestamp_min"); + if (min_ts_pos != props.end()) { + rep_->min_timestamp = Slice(min_ts_pos->second); + } + auto max_ts_pos = props.find("rocksdb.timestamp_max"); + if (max_ts_pos != props.end()) { + rep_->max_timestamp = Slice(max_ts_pos->second); } rep_->index_has_first_key = @@ -1983,10 +1991,28 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options, return Status::OK(); } +bool BlockBasedTable::TimestampMayMatch(const ReadOptions& read_options) const { + if (read_options.timestamp != nullptr && !rep_->min_timestamp.empty()) { + RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_CHECKED); + auto read_ts = read_options.timestamp; + auto comparator = rep_->internal_comparator.user_comparator(); + if (comparator->CompareTimestamp(*read_ts, rep_->min_timestamp) < 0) { + RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_FILTERED); + return false; + } + } + return true; +} + Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters) { + // Similar to Bloom filter !may_match + // If timestamp is beyond the range of the table, skip + if (!TimestampMayMatch(read_options)) { + return Status::OK(); + } assert(key.size() >= 8); // key must be internal key assert(get_context != nullptr); Status s; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index ec9473b00..3df5cb6cb 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -494,6 +494,8 @@ class BlockBasedTable : public TableReader { // in building the table file, otherwise true. bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const; + bool TimestampMayMatch(const ReadOptions& read_options) const; + // A cumulative data block file read in MultiGet lower than this size will // use a stack buffer static constexpr size_t kMultiGetReadStackBufSize = 8192; @@ -595,6 +597,12 @@ struct BlockBasedTable::Rep { // move is involved int level; + // the timestamp range of table + // Points into memory owned by TableProperties. This would need to change if + // TableProperties become subject to cache eviction. + Slice min_timestamp; + Slice max_timestamp; + // If false, blocks in this file are definitely all uncompressed. Knowing this // before reading individual blocks enables certain optimizations. bool blocks_maybe_compressed = true;