Filter table files by timestamp: Get operator (#11332)

Summary:
If RocksDB enables user-defined timestamp, then RocksDB read path can filter table files by the min/max timestamps of each file. If application wants to lookup a key that is the most recent and visible to a certain timestamp ts, then we can compare ts with the min_ts of each file. If ts < min_ts, then we know all keys in the file is not visible at time ts, then we do not have to open the file. This can also save an IO.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11332

Reviewed By: pdillinger

Differential Revision: D44763497

Pulled By: guowentian

fbshipit-source-id: abde346b9f18480fe03c04e4006e7d62aa9c22a8
oxigraph-8.3.2
Wentian Guo 2 years ago committed by Facebook GitHub Bot
parent b3c43a5b99
commit 0578d9f951
  1. 88
      db/db_with_timestamp_basic_test.cc
  2. 6
      include/rocksdb/statistics.h
  3. 3
      monitoring/statistics.cc
  4. 32
      table/block_based/block_based_table_reader.cc
  5. 8
      table/block_based/block_based_table_reader.h

@ -3918,6 +3918,94 @@ TEST_F(DBBasicTestWithTimestamp, IterSeekToLastWithIterateUpperbound) {
ASSERT_FALSE(iter->Valid()); ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
} }
TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) {
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
BlockBasedTableOptions bbto;
bbto.block_size = 100;
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
DestroyAndReopen(options);
// Put
// Create two SST files
// file1: key => [1, 3], timestamp => [10, 20]
// file2, key => [2, 4], timestamp => [30, 40]
{
WriteOptions write_opts;
std::string write_ts = Timestamp(10, 0);
ASSERT_OK(db_->Put(write_opts, Key1(1), write_ts, "value1"));
write_ts = Timestamp(20, 0);
ASSERT_OK(db_->Put(write_opts, Key1(3), write_ts, "value3"));
ASSERT_OK(Flush());
write_ts = Timestamp(30, 0);
ASSERT_OK(db_->Put(write_opts, Key1(2), write_ts, "value2"));
write_ts = Timestamp(40, 0);
ASSERT_OK(db_->Put(write_opts, Key1(4), write_ts, "value4"));
ASSERT_OK(Flush());
}
// Get with timestamp
{
auto prev_checked_events = options.statistics->getTickerCount(
Tickers::TIMESTAMP_FILTER_TABLE_CHECKED);
auto prev_filtered_events = options.statistics->getTickerCount(
Tickers::TIMESTAMP_FILTER_TABLE_FILTERED);
// key=3 (ts=20) does not exist at timestamp=1
std::string read_ts_str = Timestamp(1, 0);
Slice read_ts_slice = Slice(read_ts_str);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::string value_from_get = "";
std::string timestamp_from_get = "";
auto status =
db_->Get(read_opts, Key1(3), &value_from_get, &timestamp_from_get);
ASSERT_TRUE(status.IsNotFound());
ASSERT_EQ(value_from_get, std::string(""));
ASSERT_EQ(timestamp_from_get, std::string(""));
// key=3 is in the key ranges for both files, so both files will be queried.
// The table read was skipped because the timestamp is out of the table
// range, i.e.., 1 < [10,20], [30,40].
// The tickers increase by 2 due to 2 files.
ASSERT_EQ(prev_checked_events + 2,
options.statistics->getTickerCount(
Tickers::TIMESTAMP_FILTER_TABLE_CHECKED));
ASSERT_EQ(prev_filtered_events + 2,
options.statistics->getTickerCount(
Tickers::TIMESTAMP_FILTER_TABLE_FILTERED));
// key=3 (ts=20) exists at timestamp = 25
read_ts_str = Timestamp(25, 0);
read_ts_slice = Slice(read_ts_str);
read_opts.timestamp = &read_ts_slice;
ASSERT_OK(
db_->Get(read_opts, Key1(3), &value_from_get, &timestamp_from_get));
ASSERT_EQ("value3", value_from_get);
ASSERT_EQ(Timestamp(20, 0), timestamp_from_get);
// file1 was not skipped, because the timestamp is in range, [10,20] < 25.
// file2 was skipped, because the timestamp is not in range, 25 < [30,40].
// So the checked ticker increase by 2 due to 2 files;
// filtered ticker increase by 1 because file2 was skipped
ASSERT_EQ(prev_checked_events + 4,
options.statistics->getTickerCount(
Tickers::TIMESTAMP_FILTER_TABLE_CHECKED));
ASSERT_EQ(prev_filtered_events + 3,
options.statistics->getTickerCount(
Tickers::TIMESTAMP_FILTER_TABLE_FILTERED));
}
Close();
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -429,6 +429,12 @@ enum Tickers : uint32_t {
// that finds its data for table open // that finds its data for table open
TABLE_OPEN_PREFETCH_TAIL_HIT, TABLE_OPEN_PREFETCH_TAIL_HIT,
// Statistics on the filtering by user-defined timestamps
// # of times timestamps are checked on accessing the table
TIMESTAMP_FILTER_TABLE_CHECKED,
// # of times timestamps can successfully help skip the table access
TIMESTAMP_FILTER_TABLE_FILTERED,
TICKER_ENUM_MAX TICKER_ENUM_MAX
}; };

@ -219,6 +219,9 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"},
{TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"}, {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"},
{TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"}, {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"},
{TIMESTAMP_FILTER_TABLE_CHECKED, "rocksdb.timestamp.filter.table.checked"},
{TIMESTAMP_FILTER_TABLE_FILTERED,
"rocksdb.timestamp.filter.table.filtered"},
}; };
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = { const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {

@ -916,10 +916,18 @@ Status BlockBasedTable::ReadPropertiesBlock(
// If table properties don't contain index type, we assume that the table // If table properties don't contain index type, we assume that the table
// is in very old format and has kBinarySearch index type. // is in very old format and has kBinarySearch index type.
auto& props = rep_->table_properties->user_collected_properties; auto& props = rep_->table_properties->user_collected_properties;
auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType);
if (pos != props.end()) { if (index_type_pos != props.end()) {
rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>( rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
DecodeFixed32(pos->second.c_str())); DecodeFixed32(index_type_pos->second.c_str()));
}
auto min_ts_pos = props.find("rocksdb.timestamp_min");
if (min_ts_pos != props.end()) {
rep_->min_timestamp = Slice(min_ts_pos->second);
}
auto max_ts_pos = props.find("rocksdb.timestamp_max");
if (max_ts_pos != props.end()) {
rep_->max_timestamp = Slice(max_ts_pos->second);
} }
rep_->index_has_first_key = rep_->index_has_first_key =
@ -1983,10 +1991,28 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options,
return Status::OK(); return Status::OK();
} }
bool BlockBasedTable::TimestampMayMatch(const ReadOptions& read_options) const {
if (read_options.timestamp != nullptr && !rep_->min_timestamp.empty()) {
RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_CHECKED);
auto read_ts = read_options.timestamp;
auto comparator = rep_->internal_comparator.user_comparator();
if (comparator->CompareTimestamp(*read_ts, rep_->min_timestamp) < 0) {
RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_FILTERED);
return false;
}
}
return true;
}
Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
GetContext* get_context, GetContext* get_context,
const SliceTransform* prefix_extractor, const SliceTransform* prefix_extractor,
bool skip_filters) { bool skip_filters) {
// Similar to Bloom filter !may_match
// If timestamp is beyond the range of the table, skip
if (!TimestampMayMatch(read_options)) {
return Status::OK();
}
assert(key.size() >= 8); // key must be internal key assert(key.size() >= 8); // key must be internal key
assert(get_context != nullptr); assert(get_context != nullptr);
Status s; Status s;

@ -494,6 +494,8 @@ class BlockBasedTable : public TableReader {
// in building the table file, otherwise true. // in building the table file, otherwise true.
bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const; bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const;
bool TimestampMayMatch(const ReadOptions& read_options) const;
// A cumulative data block file read in MultiGet lower than this size will // A cumulative data block file read in MultiGet lower than this size will
// use a stack buffer // use a stack buffer
static constexpr size_t kMultiGetReadStackBufSize = 8192; static constexpr size_t kMultiGetReadStackBufSize = 8192;
@ -595,6 +597,12 @@ struct BlockBasedTable::Rep {
// move is involved // move is involved
int level; int level;
// the timestamp range of table
// Points into memory owned by TableProperties. This would need to change if
// TableProperties become subject to cache eviction.
Slice min_timestamp;
Slice max_timestamp;
// If false, blocks in this file are definitely all uncompressed. Knowing this // If false, blocks in this file are definitely all uncompressed. Knowing this
// before reading individual blocks enables certain optimizations. // before reading individual blocks enables certain optimizations.
bool blocks_maybe_compressed = true; bool blocks_maybe_compressed = true;

Loading…
Cancel
Save