From 340ed4fac751025dcf4368affabf950b3a417a05 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 5 Jun 2019 23:07:28 -0700 Subject: [PATCH] Add support for timestamp in Get/Put (#5079) Summary: It's useful to be able to (optionally) associate key-value pairs with user-provided timestamps. This PR is an early effort towards this goal and continues the work of facebook#4942. A suite of new unit tests exist in DBBasicTestWithTimestampWithParam. Support for timestamp requires the user to provide timestamp as a slice in `ReadOptions` and `WriteOptions`. All timestamps of the same database must share the same length, format, etc. The format of the timestamp is the same throughout the same database, and the user is responsible for providing a comparator function (Comparator) to order the tuples. Once created, the format and length of the timestamp cannot change (at least for now). Test plan (on devserver): ``` $COMPILE_WITH_ASAN=1 make -j32 all $./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/* $make check ``` All tests must pass. We also run the following db_bench tests to verify whether there is regression on Get/Put while timestamp is not enabled. ``` $TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000 $TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000 ``` Repeat for 6 times for both versions. Results are as follows: ``` | | readrandom | fillrandom | | master | 16.77 MB/s | 47.05 MB/s | | PR5079 | 16.44 MB/s | 47.03 MB/s | ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5079 Differential Revision: D15132946 Pulled By: riversand963 fbshipit-source-id: 833a0d657eac21182f0f206c910a6438154c742c --- HISTORY.md | 1 + db/db_basic_test.cc | 151 ++++++++++++++++++ db/db_impl/db_impl.cc | 11 +- db/db_impl/db_impl_write.cc | 24 ++- db/dbformat.h | 27 ++++ db/memtable.cc | 16 +- db/version_set.cc | 43 ++--- include/rocksdb/comparator.h | 27 ++++ include/rocksdb/options.h | 22 ++- options/options.cc | 6 +- .../block_based/block_based_table_builder.cc | 3 +- table/block_based/block_based_table_reader.cc | 17 +- table/get_context.cc | 2 +- util/comparator.cc | 8 + 14 files changed, 318 insertions(+), 40 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b3c2ef14a..028ddcf82 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ * Partitions of partitioned indexes no longer affect the read amplification statistics. * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. +* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 45524b250..1aec864dd 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -1284,6 +1284,157 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } } + +class DBBasicTestWithTimestampWithParam + : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestWithTimestampWithParam() + : DBTestBase("/db_basic_test_with_timestamp") {} + + protected: + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + assert(a.size() >= timestamp_size()); + assert(b.size() >= timestamp_size()); + Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); + Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + + return cmp_without_ts_->Compare(k1, k2); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + auto* ptr1 = const_cast(&ts1); + auto* ptr2 = const_cast(&ts2); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return 1; + } else if (high1 > high2) { + return -1; + } + if (low1 < low2) { + return 1; + } else if (low1 > low2) { + return -1; + } + return 0; + } + }; + + Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { + assert(nullptr != ts); + ts->clear(); + PutFixed64(ts, low); + PutFixed64(ts, high); + assert(ts->size() == sizeof(low) + sizeof(high)); + return Slice(*ts); + } +}; + +TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 6; + bool memtable_only = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + std::string tmp; + size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_strs(kNumTimestamps); + std::vector read_ts_strs(kNumTimestamps); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); + read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); + const Slice& write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put(cf, "key" + std::to_string(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + } + if (!memtable_only) { + ASSERT_OK(Flush(cf)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + ropts.timestamp = &read_ts_list[i]; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); +} + +INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, + ::testing::Bool()); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index ba76abc28..96b911a6d 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1376,7 +1376,16 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); + if (nullptr == read_options.timestamp) { + return GetImpl(read_options, column_family, key, value); + } + Slice akey; + std::string buf; + Status s = AppendTimestamp(key, *(read_options.timestamp), &akey, &buf); + if (s.ok()) { + s = GetImpl(read_options, column_family, akey, value); + } + return s; } Status DBImpl::GetImpl(const ReadOptions& read_options, diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 02e23e269..947194ace 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1677,11 +1677,25 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { // can call if they wish Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - // Pre-allocate size of write batch conservatively. - // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, - // and we allocate 11 extra bytes for key length, as well as value length. - WriteBatch batch(key.size() + value.size() + 24); - Status s = batch.Put(column_family, key, value); + if (nullptr == opt.timestamp) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. + WriteBatch batch(key.size() + value.size() + 24); + Status s = batch.Put(column_family, key, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); + } + Slice akey; + std::string buf; + Status s = AppendTimestamp(key, *(opt.timestamp), &akey, &buf); + if (!s.ok()) { + return s; + } + WriteBatch batch(akey.size() + value.size() + 24); + s = batch.Put(column_family, akey, value); if (!s.ok()) { return s; } diff --git a/db/dbformat.h b/db/dbformat.h index dbf6ea6f3..c6ee5677c 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -151,6 +151,17 @@ inline Slice ExtractUserKey(const Slice& internal_key) { return Slice(internal_key.data(), internal_key.size() - 8); } +inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, + size_t ts_sz) { + assert(internal_key.size() >= 8 + ts_sz); + return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz); +} + +inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { + assert(user_key.size() >= ts_sz); + return Slice(user_key.data(), user_key.size() - ts_sz); +} + inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { assert(internal_key.size() >= 8); const size_t n = internal_key.size(); @@ -658,4 +669,20 @@ struct ParsedInternalKeyComparator { const InternalKeyComparator* cmp; }; +// TODO (yanqin): this causes extra memory allocation and copy. Should be +// addressed in the future. +inline Status AppendTimestamp(const Slice& key, const Slice& timestamp, + Slice* ret_key, std::string* ret_buf) { + assert(ret_key != nullptr); + assert(ret_buf != nullptr); + if (key.data() + key.size() == timestamp.data()) { + *ret_key = Slice(key.data(), key.size() + timestamp.size()); + } else { + ret_buf->assign(key.data(), key.size()); + ret_buf->append(timestamp.data(), timestamp.size()); + *ret_key = Slice(*ret_buf); + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/db/memtable.cc b/db/memtable.cc index 46acbbfa6..fdd1a577a 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -493,6 +493,8 @@ bool MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (!allow_concurrent) { // Extract prefix for insert with hint. if (insert_with_hint_prefix_extractor_ != nullptr && @@ -525,7 +527,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->Add(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(key); + bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); } // The first sequence number inserted into the memtable @@ -559,7 +561,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(key); + bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); } // atomically update first_seqno_ and earliest_seqno_. @@ -632,8 +634,10 @@ static bool SaveValue(void* arg, const char* entry) { // all entries with overly large sequence numbers. uint32_t key_length; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); - if (s->mem->GetInternalKeyComparator().user_comparator()->Equal( - Slice(key_ptr, key_length - 8), s->key->user_key())) { + Slice user_key_slice = Slice(key_ptr, key_length - 8); + if (s->mem->GetInternalKeyComparator() + .user_comparator() + ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -767,11 +771,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = bloom_filter_->MayContain(user_key); + may_contain = + bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); } else { assert(prefix_extractor_); may_contain = diff --git a/db/version_set.cc b/db/version_set.cc index a60a4e87c..ed9a316ac 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -93,7 +93,8 @@ Status OverlapWithIterator(const Comparator* ucmp, return Status::Corruption("DB have corrupted keys"); } - if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) { + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= + 0) { *overlap = true; } } @@ -171,17 +172,16 @@ class FilePicker { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. - assert( - curr_level_ == 0 || - curr_index_in_curr_level_ == start_index_in_curr_level_ || - user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)) <= 0); - - int cmp_smallest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)); + assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); } // Setup file search bound for the next level based on the @@ -799,14 +799,16 @@ static bool AfterFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs before all keys and is therefore never after *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->largest_key)) > 0); } static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs after all keys and is therefore never before *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->smallest_key)) < 0); } bool SomeFileOverlapsRange( @@ -952,8 +954,9 @@ class LevelIterator final : public InternalIterator { bool KeyReachedUpperBound(const Slice& internal_key) { return read_options_.iterate_upper_bound != nullptr && - user_comparator_.Compare(ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(internal_key), + *read_options_.iterate_upper_bound) >= 0; } InternalIterator* NewFileIterator() { @@ -2774,11 +2777,12 @@ void VersionStorageInfo::GetOverlappingInputs( FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); - if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + if (begin != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) { // "f" is completely before specified range; skip it iter++; } else if (end != nullptr && - user_cmp->Compare(file_start, user_end) > 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) { // "f" is completely after specified range; skip it iter++; } else { @@ -2793,10 +2797,11 @@ void VersionStorageInfo::GetOverlappingInputs( iter = index.erase(iter); if (expand_range) { if (begin != nullptr && - user_cmp->Compare(file_start, user_begin) < 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) { user_begin = file_start; } - if (end != nullptr && user_cmp->Compare(file_limit, user_end) > 0) { + if (end != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) { user_end = file_limit; } } diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 46279f9a6..9f262367d 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -20,6 +20,19 @@ class Slice; // from multiple threads. class Comparator { public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + virtual ~Comparator() {} // Three-way comparison. Returns value: @@ -78,6 +91,20 @@ class Comparator { // The major use case is to determine if DataBlockHashIndex is compatible // with the customized comparator. virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return Compare(a, b); + } + + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + private: + size_t timestamp_size_; }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index cc7119410..307582fe6 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1255,6 +1255,14 @@ struct ReadOptions { // Default: 0 (don't filter by seqnum, return user keys) SequenceNumber iter_start_seqnum; + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. The user is responsible for providing a customized + // compare function via Comparator to order tuples. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; @@ -1307,12 +1315,24 @@ struct WriteOptions { // Default: false bool low_pri; + // Timestamp of write operation, e.g. Put. All timestamps of the same + // database must share the same length and format. The user is also + // responsible for providing a customized compare function via Comparator to + // order tuples. If the user wants to enable timestamp, then + // all write operations must be associated with timestamp because RocksDB, as + // a single-node storage engine currently has no knowledge of global time, + // thus has to rely on the application. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + WriteOptions() : sync(false), disableWAL(false), ignore_missing_column_families(false), no_slowdown(false), - low_pri(false) {} + low_pri(false), + timestamp(nullptr) {} }; // Options that control flush operations diff --git a/options/options.cc b/options/options.cc index a5037ee78..8977b5890 100644 --- a/options/options.cc +++ b/options/options.cc @@ -600,7 +600,8 @@ ReadOptions::ReadOptions() pin_data(false), background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), - iter_start_seqnum(0) {} + iter_start_seqnum(0), + timestamp(nullptr) {} ReadOptions::ReadOptions(bool cksum, bool cache) : snapshot(nullptr), @@ -618,6 +619,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache) pin_data(false), background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), - iter_start_seqnum(0) {} + iter_start_seqnum(0), + timestamp(nullptr) {} } // namespace rocksdb diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 9769e394f..cae93f7f2 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -531,7 +531,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { - r->filter_builder->Add(ExtractUserKey(key)); + size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); } r->last_key.assign(key.data(), key.size()); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 2fdaf2afd..37bbc3b52 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2672,8 +2672,11 @@ bool BlockBasedTable::FullFilterKeyMayMatch( const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; if (filter->whole_key_filtering()) { - may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid, - no_io, const_ikey_ptr); + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, + kNotValid, no_io, const_ikey_ptr); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && @@ -2755,6 +2758,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, iiter_unique_ptr.reset(iiter); } + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { @@ -2762,8 +2767,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && - !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, - handle.offset(), no_io); + !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), + prefix_extractor, handle.offset(), no_io); if (not_exist_in_filter) { // Not found @@ -2793,7 +2798,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } bool may_exist = biter.SeekForGet(key); - if (!may_exist) { + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. + if (!may_exist && ts_sz == 0) { // HashSeek cannot find the key this block and the the iter is not // the end of the block, i.e. cannot be in the following blocks // either. In this case, the seek_key cannot be found, so we break diff --git a/table/get_context.cc b/table/get_context.cc index 24c9ba7d5..9be16b062 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -182,7 +182,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); - if (ucmp_->Equal(parsed_key.user_key, user_key_)) { + if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) { *matched = true; // If the value is not in the snapshot, skip it if (!CheckCallback(parsed_key.sequence)) { diff --git a/util/comparator.cc b/util/comparator.cc index eab17ebcc..717ebb523 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -124,6 +124,10 @@ class BytewiseComparatorImpl : public Comparator { bool CanKeysWithDifferentByteContentsBeEqual() const override { return false; } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + return a.compare(b); + } }; class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { @@ -192,6 +196,10 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { bool CanKeysWithDifferentByteContentsBeEqual() const override { return false; } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + return -a.compare(b); + } }; }// namespace