Add support for timestamp in Get/Put (#5079)

Summary:
It's useful to be able to (optionally) associate key-value pairs with user-provided timestamps. This PR is an early step towards that goal and continues the work of facebook#4942. A suite of new unit tests is added in DBBasicTestWithTimestampWithParam. To use the feature, the user provides the timestamp as a slice in `ReadOptions` and `WriteOptions`. All timestamps within the same database must share the same length and format, and the user is responsible for providing a comparator (Comparator) that orders <key, timestamp> tuples. Once the database is created, the timestamp format and length cannot change (at least for now).
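
For orientation, here is a minimal usage sketch of the API added in this PR. The helper name is hypothetical, and it assumes the DB was already opened with a timestamp-aware Comparator (constructed with a nonzero timestamp size, as in the TestComparator used by the new unit tests below):
```cpp
// Illustrative sketch only (not part of this PR): write a key at a
// user-provided timestamp, then read it back at a read timestamp.
#include <string>

#include "rocksdb/db.h"

rocksdb::Status PutThenGetWithTimestamp(rocksdb::DB* db,
                                        const rocksdb::Slice& key,
                                        const rocksdb::Slice& value,
                                        const rocksdb::Slice& write_ts,
                                        const rocksdb::Slice& read_ts,
                                        std::string* result) {
  rocksdb::WriteOptions wopts;
  wopts.timestamp = &write_ts;  // every write must carry a timestamp
  rocksdb::Status s = db->Put(wopts, key, value);
  if (!s.ok()) {
    return s;
  }
  rocksdb::ReadOptions ropts;
  ropts.timestamp = &read_ts;  // Get returns data visible at read_ts
  return db->Get(ropts, db->DefaultColumnFamily(), key, result);
}
```
Internally, this early version simply appends the timestamp to the user key (see `AppendTimestamp` in db/dbformat.h below), so keys are stored and compared as <key, timestamp> tuples.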

Test plan (on devserver):
```
$COMPILE_WITH_ASAN=1 make -j32 all
$./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/*
$make check
```
All tests must pass.

We also ran the following db_bench tests to check whether Get/Put regress when timestamps are not enabled.
```
$TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000
$TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000
```
Each benchmark was repeated 6 times for both versions.

Results are as follows:

|         | readrandom | fillrandom |
|---------|------------|------------|
| master  | 16.77 MB/s | 47.05 MB/s |
| PR 5079 | 16.44 MB/s | 47.03 MB/s |
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5079

Differential Revision: D15132946

Pulled By: riversand963

fbshipit-source-id: 833a0d657eac21182f0f206c910a6438154c742c
Branch: main
Author: Yanqin Jin (committed by Facebook Github Bot)
Parent: cb1bf09bfc
Commit: 340ed4fac7
14 changed files (lines changed per file):

1. HISTORY.md (1)
2. db/db_basic_test.cc (151)
3. db/db_impl/db_impl.cc (9)
4. db/db_impl/db_impl_write.cc (14)
5. db/dbformat.h (27)
6. db/memtable.cc (16)
7. db/version_set.cc (37)
8. include/rocksdb/comparator.h (27)
9. include/rocksdb/options.h (22)
10. options/options.cc (6)
11. table/block_based/block_based_table_builder.cc (3)
12. table/block_based/block_based_table_reader.cc (17)
13. table/get_context.cc (2)
14. util/comparator.cc (8)

HISTORY.md

```diff
@@ -5,6 +5,7 @@
 * Partitions of partitioned indexes no longer affect the read amplification statistics.
 * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
 * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set.
+* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put.
 ### New Features
 * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
```

db/db_basic_test.cc

```diff
@@ -1284,6 +1284,157 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
     }
   }
 }
 
+class DBBasicTestWithTimestampWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<bool> {
+ public:
+  DBBasicTestWithTimestampWithParam()
+      : DBTestBase("/db_basic_test_with_timestamp") {}
+
+ protected:
+  class TestComparator : public Comparator {
+   private:
+    const Comparator* cmp_without_ts_;
+
+   public:
+    explicit TestComparator(size_t ts_sz)
+        : Comparator(ts_sz), cmp_without_ts_(nullptr) {
+      cmp_without_ts_ = BytewiseComparator();
+    }
+
+    const char* Name() const override { return "TestComparator"; }
+
+    void FindShortSuccessor(std::string*) const override {}
+
+    void FindShortestSeparator(std::string*, const Slice&) const override {}
+
+    int Compare(const Slice& a, const Slice& b) const override {
+      int r = CompareWithoutTimestamp(a, b);
+      if (r != 0 || 0 == timestamp_size()) {
+        return r;
+      }
+      return CompareTimestamp(
+          Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
+          Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
+    }
+
+    int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+      assert(a.size() >= timestamp_size());
+      assert(b.size() >= timestamp_size());
+      Slice k1 = StripTimestampFromUserKey(a, timestamp_size());
+      Slice k2 = StripTimestampFromUserKey(b, timestamp_size());
+      return cmp_without_ts_->Compare(k1, k2);
+    }
+
+    int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+      if (!ts1.data() && !ts2.data()) {
+        return 0;
+      } else if (ts1.data() && !ts2.data()) {
+        return 1;
+      } else if (!ts1.data() && ts2.data()) {
+        return -1;
+      }
+      assert(ts1.size() == ts2.size());
+      uint64_t low1 = 0;
+      uint64_t low2 = 0;
+      uint64_t high1 = 0;
+      uint64_t high2 = 0;
+      auto* ptr1 = const_cast<Slice*>(&ts1);
+      auto* ptr2 = const_cast<Slice*>(&ts2);
+      if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
+          !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
+        assert(false);
+      }
+      if (high1 < high2) {
+        return 1;
+      } else if (high1 > high2) {
+        return -1;
+      }
+      if (low1 < low2) {
+        return 1;
+      } else if (low1 > low2) {
+        return -1;
+      }
+      return 0;
+    }
+  };
+
+  Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) {
+    assert(nullptr != ts);
+    ts->clear();
+    PutFixed64(ts, low);
+    PutFixed64(ts, high);
+    assert(ts->size() == sizeof(low) + sizeof(high));
+    return Slice(*ts);
+  }
+};
+
+TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) {
+  const int kNumKeysPerFile = 8192;
+  const size_t kNumTimestamps = 6;
+  bool memtable_only = GetParam();
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+  std::string tmp;
+  size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size();
+  TestComparator test_cmp(ts_sz);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(
+      10 /*bits_per_key*/, false /*use_block_based_builder*/));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  std::vector<std::string> write_ts_strs(kNumTimestamps);
+  std::vector<std::string> read_ts_strs(kNumTimestamps);
+  std::vector<Slice> write_ts_list;
+  std::vector<Slice> read_ts_list;
+
+  for (size_t i = 0; i != kNumTimestamps; ++i) {
+    write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i]));
+    read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i]));
+    const Slice& write_ts = write_ts_list.back();
+    WriteOptions wopts;
+    wopts.timestamp = &write_ts;
+    for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+      for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+        ASSERT_OK(Put(cf, "key" + std::to_string(j),
+                      "value_" + std::to_string(j) + "_" + std::to_string(i),
+                      wopts));
+      }
+      if (!memtable_only) {
+        ASSERT_OK(Flush(cf));
+      }
+    }
+  }
+
+  const auto& verify_db_func = [&]() {
+    for (size_t i = 0; i != kNumTimestamps; ++i) {
+      ReadOptions ropts;
+      ropts.timestamp = &read_ts_list[i];
+      for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+        ColumnFamilyHandle* cfh = handles_[cf];
+        for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+          std::string value;
+          ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value));
+          ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+                    value);
+        }
+      }
+    }
+  };
+  verify_db_func();
+}
+
+INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam,
+                        ::testing::Bool());
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
```

db/db_impl/db_impl.cc

```diff
@@ -1376,7 +1376,16 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
 Status DBImpl::Get(const ReadOptions& read_options,
                    ColumnFamilyHandle* column_family, const Slice& key,
                    PinnableSlice* value) {
-  return GetImpl(read_options, column_family, key, value);
+  if (nullptr == read_options.timestamp) {
+    return GetImpl(read_options, column_family, key, value);
+  }
+  Slice akey;
+  std::string buf;
+  Status s = AppendTimestamp(key, *(read_options.timestamp), &akey, &buf);
+  if (s.ok()) {
+    s = GetImpl(read_options, column_family, akey, value);
+  }
+  return s;
 }
 
 Status DBImpl::GetImpl(const ReadOptions& read_options,
```

db/db_impl/db_impl_write.cc

```diff
@@ -1677,6 +1677,7 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
 // can call if they wish
 Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
                const Slice& key, const Slice& value) {
+  if (nullptr == opt.timestamp) {
   // Pre-allocate size of write batch conservatively.
   // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
   // and we allocate 11 extra bytes for key length, as well as value length.
@@ -1686,6 +1687,19 @@ Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
     return s;
   }
   return Write(opt, &batch);
+  }
+  Slice akey;
+  std::string buf;
+  Status s = AppendTimestamp(key, *(opt.timestamp), &akey, &buf);
+  if (!s.ok()) {
+    return s;
+  }
+  WriteBatch batch(akey.size() + value.size() + 24);
+  s = batch.Put(column_family, akey, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
 }
 
 Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
```

db/dbformat.h

```diff
@@ -151,6 +151,17 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
   return Slice(internal_key.data(), internal_key.size() - 8);
 }
 
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+                                             size_t ts_sz) {
+  assert(internal_key.size() >= 8 + ts_sz);
+  return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz);
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  assert(user_key.size() >= ts_sz);
+  return Slice(user_key.data(), user_key.size() - ts_sz);
+}
+
 inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
   assert(internal_key.size() >= 8);
   const size_t n = internal_key.size();
@@ -658,4 +669,20 @@ struct ParsedInternalKeyComparator {
   const InternalKeyComparator* cmp;
 };
 
+// TODO (yanqin): this causes extra memory allocation and copy. Should be
+// addressed in the future.
+inline Status AppendTimestamp(const Slice& key, const Slice& timestamp,
+                              Slice* ret_key, std::string* ret_buf) {
+  assert(ret_key != nullptr);
+  assert(ret_buf != nullptr);
+  if (key.data() + key.size() == timestamp.data()) {
+    *ret_key = Slice(key.data(), key.size() + timestamp.size());
+  } else {
+    ret_buf->assign(key.data(), key.size());
+    ret_buf->append(timestamp.data(), timestamp.size());
+    *ret_key = Slice(*ret_buf);
+  }
+  return Status::OK();
+}
+
 }  // namespace rocksdb
```

db/memtable.cc

```diff
@@ -493,6 +493,8 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
   p = EncodeVarint32(p, val_size);
   memcpy(p, value.data(), val_size);
   assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
+  size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+
   if (!allow_concurrent) {
     // Extract prefix for insert with hint.
     if (insert_with_hint_prefix_extractor_ != nullptr &&
@@ -525,7 +527,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
       bloom_filter_->Add(prefix_extractor_->Transform(key));
     }
     if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
-      bloom_filter_->Add(key);
+      bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz));
     }
 
     // The first sequence number inserted into the memtable
@@ -559,7 +561,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
      bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key));
     }
    if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
-      bloom_filter_->AddConcurrently(key);
+      bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz));
     }
 
     // atomically update first_seqno_ and earliest_seqno_.
@@ -632,8 +634,10 @@ static bool SaveValue(void* arg, const char* entry) {
   // all entries with overly large sequence numbers.
   uint32_t key_length;
   const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
-  if (s->mem->GetInternalKeyComparator().user_comparator()->Equal(
-          Slice(key_ptr, key_length - 8), s->key->user_key())) {
+  Slice user_key_slice = Slice(key_ptr, key_length - 8);
+  if (s->mem->GetInternalKeyComparator()
+          .user_comparator()
+          ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) {
     // Correct user key
     const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
     ValueType type;
@@ -767,11 +771,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
   bool found_final_value = false;
   bool merge_in_progress = s->IsMergeInProgress();
   bool may_contain = true;
+  size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
   if (bloom_filter_) {
     // when both memtable_whole_key_filtering and prefix_extractor_ are set,
     // only do whole key filtering for Get() to save CPU
     if (moptions_.memtable_whole_key_filtering) {
-      may_contain = bloom_filter_->MayContain(user_key);
+      may_contain =
+          bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz));
     } else {
       assert(prefix_extractor_);
       may_contain =
```

db/version_set.cc

```diff
@@ -93,7 +93,8 @@ Status OverlapWithIterator(const Comparator* ucmp,
       return Status::Corruption("DB have corrupted keys");
     }
 
-    if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) {
+    if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <=
+        0) {
      *overlap = true;
     }
   }
@@ -171,17 +172,16 @@ class FilePicker {
         // Check if key is within a file's range. If search left bound and
         // right bound point to the same find, we are sure key falls in
         // range.
-        assert(
-            curr_level_ == 0 ||
-            curr_index_in_curr_level_ == start_index_in_curr_level_ ||
-            user_comparator_->Compare(user_key_,
-                                      ExtractUserKey(f->smallest_key)) <= 0);
-        int cmp_smallest = user_comparator_->Compare(user_key_,
-                                                     ExtractUserKey(f->smallest_key));
+        assert(curr_level_ == 0 ||
+               curr_index_in_curr_level_ == start_index_in_curr_level_ ||
+               user_comparator_->CompareWithoutTimestamp(
+                   user_key_, ExtractUserKey(f->smallest_key)) <= 0);
+        int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+            user_key_, ExtractUserKey(f->smallest_key));
         if (cmp_smallest >= 0) {
-          cmp_largest = user_comparator_->Compare(user_key_,
-                                                  ExtractUserKey(f->largest_key));
+          cmp_largest = user_comparator_->CompareWithoutTimestamp(
+              user_key_, ExtractUserKey(f->largest_key));
         }
 
         // Setup file search bound for the next level based on the
@@ -799,14 +799,16 @@ static bool AfterFile(const Comparator* ucmp,
                       const Slice* user_key, const FdWithKeyRange* f) {
   // nullptr user_key occurs before all keys and is therefore never after *f
   return (user_key != nullptr &&
-          ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0);
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->largest_key)) > 0);
 }
 
 static bool BeforeFile(const Comparator* ucmp,
                        const Slice* user_key, const FdWithKeyRange* f) {
   // nullptr user_key occurs after all keys and is therefore never before *f
   return (user_key != nullptr &&
-          ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0);
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->smallest_key)) < 0);
 }
 
 bool SomeFileOverlapsRange(
@@ -952,7 +954,8 @@ class LevelIterator final : public InternalIterator {
   bool KeyReachedUpperBound(const Slice& internal_key) {
     return read_options_.iterate_upper_bound != nullptr &&
-           user_comparator_.Compare(ExtractUserKey(internal_key),
-                                    *read_options_.iterate_upper_bound) >= 0;
+           user_comparator_.CompareWithoutTimestamp(
+               ExtractUserKey(internal_key),
+               *read_options_.iterate_upper_bound) >= 0;
   }
 
@@ -2774,11 +2777,12 @@ void VersionStorageInfo::GetOverlappingInputs(
     FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
     const Slice file_start = ExtractUserKey(f->smallest_key);
     const Slice file_limit = ExtractUserKey(f->largest_key);
-    if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
+    if (begin != nullptr &&
+        user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
       // "f" is completely before specified range; skip it
       iter++;
     } else if (end != nullptr &&
-               user_cmp->Compare(file_start, user_end) > 0) {
+               user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
       // "f" is completely after specified range; skip it
       iter++;
     } else {
@@ -2793,10 +2797,11 @@ void VersionStorageInfo::GetOverlappingInputs(
       iter = index.erase(iter);
       if (expand_range) {
         if (begin != nullptr &&
-            user_cmp->Compare(file_start, user_begin) < 0) {
+            user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
           user_begin = file_start;
         }
-        if (end != nullptr && user_cmp->Compare(file_limit, user_end) > 0) {
+        if (end != nullptr &&
+            user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
           user_end = file_limit;
         }
       }
```

include/rocksdb/comparator.h

```diff
@@ -20,6 +20,19 @@ class Slice;
 // from multiple threads.
 class Comparator {
  public:
+  Comparator() : timestamp_size_(0) {}
+
+  Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {}
+
+  Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {}
+
+  Comparator& operator=(const Comparator& rhs) {
+    if (this != &rhs) {
+      timestamp_size_ = rhs.timestamp_size_;
+    }
+    return *this;
+  }
+
   virtual ~Comparator() {}
 
   // Three-way comparison. Returns value:
@@ -78,6 +91,20 @@ class Comparator {
   // The major use case is to determine if DataBlockHashIndex is compatible
   // with the customized comparator.
   virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
+
+  inline size_t timestamp_size() const { return timestamp_size_; }
+
+  virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const {
+    return Compare(a, b);
+  }
+
+  virtual int CompareTimestamp(const Slice& /*ts1*/,
+                               const Slice& /*ts2*/) const {
+    return 0;
+  }
+
+ private:
+  size_t timestamp_size_;
 };
 
 // Return a builtin comparator that uses lexicographic byte-wise
```

include/rocksdb/options.h

```diff
@@ -1255,6 +1255,14 @@ struct ReadOptions {
   // Default: 0 (don't filter by seqnum, return user keys)
   SequenceNumber iter_start_seqnum;
 
+  // Timestamp of operation. Read should return the latest data visible to the
+  // specified timestamp. All timestamps of the same database must be of the
+  // same length and format. The user is responsible for providing a customized
+  // compare function via Comparator to order <key, timestamp> tuples.
+  // The user-specified timestamp feature is still under active development,
+  // and the API is subject to change.
+  const Slice* timestamp;
+
   ReadOptions();
   ReadOptions(bool cksum, bool cache);
 };
@@ -1307,12 +1315,24 @@ struct WriteOptions {
   // Default: false
   bool low_pri;
 
+  // Timestamp of write operation, e.g. Put. All timestamps of the same
+  // database must share the same length and format. The user is also
+  // responsible for providing a customized compare function via Comparator to
+  // order <key, timestamp> tuples. If the user wants to enable timestamp, then
+  // all write operations must be associated with timestamp because RocksDB, as
+  // a single-node storage engine currently has no knowledge of global time,
+  // thus has to rely on the application.
+  // The user-specified timestamp feature is still under active development,
+  // and the API is subject to change.
+  const Slice* timestamp;
+
   WriteOptions()
       : sync(false),
         disableWAL(false),
         ignore_missing_column_families(false),
         no_slowdown(false),
-        low_pri(false) {}
+        low_pri(false),
+        timestamp(nullptr) {}
 };
 
 // Options that control flush operations
```

options/options.cc

```diff
@@ -600,7 +600,8 @@ ReadOptions::ReadOptions()
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
       ignore_range_deletions(false),
-      iter_start_seqnum(0) {}
+      iter_start_seqnum(0),
+      timestamp(nullptr) {}
 
 ReadOptions::ReadOptions(bool cksum, bool cache)
     : snapshot(nullptr),
@@ -618,6 +619,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
       ignore_range_deletions(false),
-      iter_start_seqnum(0) {}
+      iter_start_seqnum(0),
+      timestamp(nullptr) {}
 
 }  // namespace rocksdb
```

table/block_based/block_based_table_builder.cc

```diff
@@ -531,7 +531,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
     // Note: PartitionedFilterBlockBuilder requires key being added to filter
     // builder after being added to index builder.
     if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) {
-      r->filter_builder->Add(ExtractUserKey(key));
+      size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size();
+      r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
     }
 
     r->last_key.assign(key.data(), key.size());
```

table/block_based/block_based_table_reader.cc

```diff
@@ -2672,8 +2672,11 @@ bool BlockBasedTable::FullFilterKeyMayMatch(
   const Slice* const const_ikey_ptr = &internal_key;
   bool may_match = true;
   if (filter->whole_key_filtering()) {
-    may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid,
-                                    no_io, const_ikey_ptr);
+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
+    Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
+    may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor,
+                                    kNotValid, no_io, const_ikey_ptr);
   } else if (!read_options.total_order_seek && prefix_extractor &&
              rep_->table_properties->prefix_extractor_name.compare(
                  prefix_extractor->Name()) == 0 &&
@@ -2755,6 +2758,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       iiter_unique_ptr.reset(iiter);
     }
 
+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
     bool matched = false;  // if such user key mathced a key in SST
     bool done = false;
     for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
@@ -2762,8 +2767,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       bool not_exist_in_filter =
           filter != nullptr && filter->IsBlockBased() == true &&
-          !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor,
-                               handle.offset(), no_io);
+          !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
+                               prefix_extractor, handle.offset(), no_io);
 
       if (not_exist_in_filter) {
         // Not found
@@ -2793,7 +2798,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
         }
 
         bool may_exist = biter.SeekForGet(key);
-        if (!may_exist) {
+        // If user-specified timestamp is supported, we cannot end the search
+        // just because hash index lookup indicates the key+ts does not exist.
+        if (!may_exist && ts_sz == 0) {
          // HashSeek cannot find the key this block and the the iter is not
          // the end of the block, i.e. cannot be in the following blocks
          // either. In this case, the seek_key cannot be found, so we break
```

table/get_context.cc

```diff
@@ -182,7 +182,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
   assert(matched);
   assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
          merge_context_ != nullptr);
-  if (ucmp_->Equal(parsed_key.user_key, user_key_)) {
+  if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) {
     *matched = true;
     // If the value is not in the snapshot, skip it
     if (!CheckCallback(parsed_key.sequence)) {
```

util/comparator.cc

```diff
@@ -124,6 +124,10 @@ class BytewiseComparatorImpl : public Comparator {
   bool CanKeysWithDifferentByteContentsBeEqual() const override {
     return false;
   }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+    return a.compare(b);
+  }
 };
 
 class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
@@ -192,6 +196,10 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
   bool CanKeysWithDifferentByteContentsBeEqual() const override {
     return false;
   }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+    return -a.compare(b);
+  }
 };
 
 }  // namespace
```
