ReadOptions.iter_start_ts should support tombstones (#7178)

Summary:
as title.
When ReadOptions.iter_start_ts is not nullptr, DBIter::key() should
return internal keys including value type.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7178

Test Plan: make check

Reviewed By: ltamasi

Differential Revision: D22935879

Pulled By: riversand963

fbshipit-source-id: 7508d962cf11ebcfa6386d2529b4f3606b47ccfd
main
Yanqin Jin 4 years ago committed by Facebook GitHub Bot
parent 493f425e77
commit 2735b0275d
  1. 30
      db/db_iter.cc
  2. 2
      db/db_iter.h
  3. 144
      db/db_with_timestamp_basic_test.cc

@ -246,11 +246,10 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
}
assert(ikey_.user_key.size() >= timestamp_size_);
Slice ts;
Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey(
ikey_.user_key, timestamp_size_)
: Slice();
bool more_recent = false;
if (timestamp_size_ > 0) {
ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_);
}
if (IsVisible(ikey_.sequence, ts, &more_recent)) {
// If the previous entry is of seqnum 0, the current entry will not
// possibly be skipped. This condition can potentially be relaxed to
@ -285,7 +284,20 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
// 2) return ikey only if ikey.seqnum >= start_seqnum_
// note that if deletion seqnum is < start_seqnum_ we
// just skip it like in normal iterator.
if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_) {
if (start_seqnum_ > 0) {
if (ikey_.sequence >= start_seqnum_) {
saved_key_.SetInternalKey(ikey_);
valid_ = true;
return true;
} else {
saved_key_.SetUserKey(
ikey_.user_key,
!pin_thru_lifetime_ ||
!iter_.iter()->IsKeyPinned() /* copy */);
skipping_saved_key = true;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
}
} else if (timestamp_lb_) {
saved_key_.SetInternalKey(ikey_);
valid_ = true;
return true;
@ -300,10 +312,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
case kTypeValue:
case kTypeBlobIndex:
if (start_seqnum_ > 0) {
// we are taking incremental snapshot here
// incremental snapshots aren't supported on DB with range deletes
assert(ikey_.type != kTypeBlobIndex);
if (ikey_.sequence >= start_seqnum_) {
assert(ikey_.type != kTypeBlobIndex);
saved_key_.SetInternalKey(ikey_);
valid_ = true;
return true;
@ -316,6 +326,10 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
!iter_.iter()->IsKeyPinned() /* copy */);
skipping_saved_key = true;
}
} else if (timestamp_lb_) {
saved_key_.SetInternalKey(ikey_);
valid_ = true;
return true;
} else {
saved_key_.SetUserKey(
ikey_.user_key, !pin_thru_lifetime_ ||

@ -143,7 +143,7 @@ class DBIter final : public Iterator {
bool Valid() const override { return valid_; }
Slice key() const override {
assert(valid_);
if (start_seqnum_ > 0) {
if (start_seqnum_ > 0 || timestamp_lb_) {
return saved_key_.GetInternalKey();
} else {
const Slice ukey_and_ts = saved_key_.GetUserKey();

@ -120,12 +120,15 @@ class DBBasicTestWithTimestampBase : public DBTestBase {
}
void CheckIterUserEntry(const Iterator* it, const Slice& expected_key,
ValueType expected_value_type,
const Slice& expected_value,
const Slice& expected_ts) const {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
ASSERT_EQ(expected_key, it->key());
ASSERT_EQ(expected_value, it->value());
if (kTypeValue == expected_value_type) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
@ -137,10 +140,30 @@ class DBBasicTestWithTimestampBase : public DBTestBase {
std::string ukey_and_ts;
ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey(ukey_and_ts, expected_seq, expected_val_type);
std::string ikey;
AppendInternalKey(&ikey, parsed_ikey);
ASSERT_EQ(Slice(ikey), it->key());
ParsedInternalKey parsed_ikey;
ASSERT_TRUE(ParseInternalKey(it->key(), &parsed_ikey));
ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key);
ASSERT_EQ(expected_val_type, parsed_ikey.type);
ASSERT_EQ(expected_seq, parsed_ikey.sequence);
if (expected_val_type == kTypeValue) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
ValueType expected_val_type, const Slice& expected_value,
const Slice& expected_ts) {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
std::string ukey_and_ts;
ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_TRUE(ParseInternalKey(it->key(), &parsed_ikey));
ASSERT_EQ(expected_val_type, parsed_ikey.type);
ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key);
if (expected_val_type == kTypeValue) {
ASSERT_EQ(expected_value, it->value());
}
@ -188,8 +211,8 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) {
uint64_t key = 0;
for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
it->Next(), ++count, ++key) {
CheckIterUserEntry(it.get(), Key1(key), "value" + std::to_string(i),
write_timestamps[i]);
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
"value" + std::to_string(i), write_timestamps[i]);
}
size_t expected_count = kMaxKey - start_keys[i] + 1;
ASSERT_EQ(expected_count, count);
@ -208,8 +231,8 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) {
it.reset(db_->NewIterator(read_opts));
for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
it->Valid(); it->Next(), ++key, ++count) {
CheckIterUserEntry(it.get(), Key1(key), "value" + std::to_string(i),
write_timestamps[i]);
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
"value" + std::to_string(i), write_timestamps[i]);
}
ASSERT_EQ(r - std::max(l, start_keys[i]), count);
l += (kMaxKey / 100);
@ -220,8 +243,8 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) {
}
TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) {
const int kNumKeysPerFile = 128;
const uint64_t kMaxKey = 1024;
constexpr int kNumKeysPerFile = 128;
constexpr uint64_t kMaxKey = 1024;
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
@ -255,17 +278,47 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) {
int count = 0;
uint64_t key = 0;
for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
CheckIterUserEntry(it.get(), Key1(key), "value" + std::to_string(i),
write_timestamps[i]);
CheckIterEntry(it.get(), Key1(key), kTypeValue,
"value" + std::to_string(i), write_timestamps[i]);
if (i > 0) {
it->Next();
CheckIterUserEntry(it.get(), Key1(key), "value" + std::to_string(i - 1),
write_timestamps[i - 1]);
CheckIterEntry(it.get(), Key1(key), kTypeValue,
"value" + std::to_string(i - 1),
write_timestamps[i - 1]);
}
}
size_t expected_count = kMaxKey + 1;
ASSERT_EQ(expected_count, count);
}
// Delete all keys@ts=5 and check iteration result with start ts set
{
std::string write_timestamp = Timestamp(5, 0);
WriteOptions write_opts;
Slice write_ts = write_timestamp;
write_opts.timestamp = &write_ts;
for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
Status s = db_->Delete(write_opts, Key1(key));
ASSERT_OK(s);
}
std::string read_timestamp = Timestamp(6, 0);
ReadOptions read_opts;
Slice read_ts = read_timestamp;
read_opts.timestamp = &read_ts;
std::string read_timestamp_lb = Timestamp(2, 0);
Slice read_ts_lb = read_timestamp_lb;
read_opts.iter_start_ts = &read_ts_lb;
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
int count = 0;
uint64_t key = 0;
for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
write_ts);
// Skip key@ts=3 and land on tombstone key@ts=5
it->Next();
}
ASSERT_EQ(kMaxKey + 1, count);
}
Close();
}
@ -296,34 +349,61 @@ TEST_F(DBBasicTestWithTimestamp, ForwardIterateStartSeqnum) {
for (size_t i = 0; i != write_ts_list.size(); ++i) {
Slice write_ts = write_ts_list[i];
write_opts.timestamp = &write_ts;
uint64_t k = kMinKey;
do {
Status s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i));
ASSERT_OK(s);
if (k == kMaxKey) {
break;
for (uint64_t k = kMaxKey; k >= kMinKey; --k) {
Status s;
if (k % 2) {
s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i));
} else {
s = db_->Delete(write_opts, Key1(k));
}
++k;
} while (k != 0);
ASSERT_OK(s);
}
start_seqs.push_back(db_->GetLatestSequenceNumber());
}
std::vector<std::string> read_ts_list;
for (int t = 0; t != kNumTimestamps - 1; ++t) {
read_ts_list.push_back(Timestamp(2 * t + 3, /*do not care*/ 17));
}
ReadOptions read_opts;
// Scan with only read_opts.iter_start_seqnum set.
for (size_t i = 0; i != read_ts_list.size(); ++i) {
Slice read_ts = read_ts_list[i];
read_opts.timestamp = &read_ts;
read_opts.iter_start_seqnum = start_seqs[i];
read_opts.iter_start_seqnum = start_seqs[i] + 1;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
SequenceNumber expected_seq = start_seqs[i] + 1;
SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1;
uint64_t key = kMinKey;
for (iter->Seek(Key1(kMinKey)); iter->Valid(); iter->Next()) {
CheckIterEntry(iter.get(), Key1(key), expected_seq, kTypeValue,
CheckIterEntry(
iter.get(), Key1(key), expected_seq,
(key % 2) ? kTypeValue : kTypeDeletionWithTimestamp,
(key % 2) ? "value" + std::to_string(i + 1) : std::string(),
write_ts_list[i + 1]);
++key;
--expected_seq;
}
}
// Scan with both read_opts.iter_start_seqnum and read_opts.iter_start_ts set.
std::vector<std::string> read_ts_lb_list;
for (int t = 0; t < kNumTimestamps - 1; ++t) {
read_ts_lb_list.push_back(Timestamp(2 * t, /*do not care*/ 17));
}
for (size_t i = 0; i < read_ts_list.size(); ++i) {
Slice read_ts = read_ts_list[i];
Slice read_ts_lb = read_ts_lb_list[i];
read_opts.timestamp = &read_ts;
read_opts.iter_start_ts = &read_ts_lb;
read_opts.iter_start_seqnum = start_seqs[i] + 1;
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
uint64_t key = kMinKey;
SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1;
for (it->Seek(Key1(kMinKey)); it->Valid(); it->Next()) {
CheckIterEntry(it.get(), Key1(key), expected_seq,
(key % 2) ? kTypeValue : kTypeDeletionWithTimestamp,
"value" + std::to_string(i + 1), write_ts_list[i + 1]);
++key;
++expected_seq;
--expected_seq;
}
}
Close();
@ -357,7 +437,7 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) {
read_opts.timestamp = &ts;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
iter->SeekToFirst();
CheckIterUserEntry(iter.get(), "foo", "value0", ts_str);
CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str);
ASSERT_EQ(
1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
}
@ -403,7 +483,7 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) {
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
iter->Seek("a");
iter->Next();
CheckIterUserEntry(iter.get(), "b", "new_value", ts_str);
CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str);
ASSERT_EQ(
1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
}
@ -1049,8 +1129,8 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) {
// Seek to kMaxKey
iter->Seek(Key1(kMaxKey));
CheckIterUserEntry(iter.get(), Key1(kMaxKey), "value" + std::to_string(i),
write_ts_list[i]);
CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue,
"value" + std::to_string(i), write_ts_list[i]);
iter->Next();
ASSERT_FALSE(iter->Valid());
}
@ -1084,7 +1164,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) {
pe->Transform(saved_prev_key) != pe->Transform(start_key)) {
break;
}
CheckIterUserEntry(it.get(), Key1(expected_key),
CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
"value" + std::to_string(i), write_ts_list[i]);
++count;
++expected_key;

Loading…
Cancel
Save