diff --git a/db/builder.cc b/db/builder.cc index 9c7402a31..84d2396c3 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -15,6 +15,7 @@ #include "db/blob/blob_file_builder.h" #include "db/compaction/compaction_iterator.h" +#include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" #include "db/merge_helper.h" @@ -205,21 +206,38 @@ Status BuildTable( /*compaction=*/nullptr, compaction_filter.get(), /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); + const size_t ts_sz = ucmp->timestamp_size(); + const bool strip_timestamp = + ts_sz > 0 && !ioptions.persist_user_defined_timestamps; + + std::string key_after_flush_buf; c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); const ParsedInternalKey& ikey = c_iter.ikey(); - // Generate a rolling 64-bit hash of the key and values - // Note : - // Here "key" integrates 'sequence_number'+'kType'+'user key'. - s = output_validator.Add(key, value); + Slice key_after_flush = key; + // If user defined timestamps will be stripped from user key after flush, + // the in memory version of the key act logically the same as one with a + // minimum timestamp. We update the timestamp here so file boundary and + // output validator, block builder all see the effect of the stripping. + if (strip_timestamp) { + key_after_flush_buf.clear(); + ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz); + key_after_flush = key_after_flush_buf; + } + + // Generate a rolling 64-bit hash of the key and values + // Note : + // Here "key" integrates 'sequence_number'+'kType'+'user key'. + s = output_validator.Add(key_after_flush, value); if (!s.ok()) { break; } - builder->Add(key, value); + builder->Add(key_after_flush, value); - s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); + s = meta->UpdateBoundaries(key_after_flush, value, ikey.sequence, + ikey.type); if (!s.ok()) { break; } @@ -244,6 +262,7 @@ Status BuildTable( range_del_it->Next()) { auto tombstone = range_del_it->Tombstone(); auto kv = tombstone.Serialize(); + // TODO(yuzhangyu): handle range deletion for UDT in memtables only. builder->Add(kv.first.Encode(), kv.second); InternalKey tombstone_end = tombstone.SerializeEndKey(); meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_, @@ -354,6 +373,8 @@ Status BuildTable( s = *io_status; } + // TODO(yuzhangyu): handle the key copy in the blob when ts should be + // stripped. if (blob_file_builder) { if (s.ok()) { s = blob_file_builder->Finish(); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 554a184f8..2b7edc6eb 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -311,7 +311,9 @@ TEST_F(DBWALTest, Recover) { } while (ChangeWalOptions()); } -class DBWALTestWithTimestamp : public DBBasicTestWithTimestampBase { +class DBWALTestWithTimestamp + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { public: DBWALTestWithTimestamp() : DBBasicTestWithTimestampBase("db_wal_test_with_timestamp") {} @@ -401,6 +403,54 @@ TEST_F(DBWALTestWithTimestamp, RecoverInconsistentTimestamp) { ReopenColumnFamiliesWithTs({"pikachu"}, ts_options).IsInvalidArgument()); } +TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) { + // Set up the option that enables user defined timestmp size. + std::string min_ts = Timestamp(0, 0); + std::string write_ts = Timestamp(1, 0); + const size_t kTimestampSize = write_ts.size(); + TestComparator test_cmp(kTimestampSize); + Options ts_options; + ts_options.create_if_missing = true; + ts_options.comparator = &test_cmp; + bool persist_udt = test::ShouldPersistUDT(GetParam()); + ts_options.persist_user_defined_timestamps = persist_udt; + + std::string smallest_ukey_without_ts = "baz"; + std::string largest_ukey_without_ts = "foo"; + + ASSERT_OK(CreateAndReopenWithCFWithTs({"pikachu"}, ts_options)); + ASSERT_OK(Put(1, largest_ukey_without_ts, write_ts, "v1")); + ASSERT_OK(Put(1, smallest_ukey_without_ts, write_ts, "v5")); + + // Very small write buffer size to force flush memtables recovered from WAL. + ts_options.write_buffer_size = 16; + ts_options.arena_block_size = 16; + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(handles_[1], &level_to_files); + ASSERT_GT(level_to_files.size(), 1); + // L0 only has one SST file. + ASSERT_EQ(level_to_files[0].size(), 1); + auto meta = level_to_files[0][0]; + if (persist_udt) { + ASSERT_EQ(smallest_ukey_without_ts + write_ts, meta.smallest.user_key()); + ASSERT_EQ(largest_ukey_without_ts + write_ts, meta.largest.user_key()); + } else { + ASSERT_EQ(smallest_ukey_without_ts + min_ts, meta.smallest.user_key()); + ASSERT_EQ(largest_ukey_without_ts + min_ts, meta.largest.user_key()); + } +} + +// Param 0: test mode for the user-defined timestamp feature +INSTANTIATE_TEST_CASE_P( + RecoverAndFlush, DBWALTestWithTimestamp, + ::testing::Values( + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + test::UserDefinedTimestampTestMode::kNormal)); + TEST_F(DBWALTest, RecoverWithTableHandle) { do { Options options = CurrentOptions(); diff --git a/db/dbformat.cc b/db/dbformat.cc index 720cdceac..2d24c8953 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -120,6 +120,16 @@ void StripTimestampFromInternalKey(std::string* result, const Slice& key, kNumInternalBytes); } +void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + const size_t key_sz = key.size(); + assert(key_sz >= ts_sz + kNumInternalBytes); + result->reserve(key_sz); + result->append(key.data(), key_sz - kNumInternalBytes - ts_sz); + result->append(ts_sz, static_cast(0)); + result->append(key.data() + key_sz - kNumInternalBytes, kNumInternalBytes); +} + std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { std::string result = "'"; if (log_err_key) { diff --git a/db/dbformat.h b/db/dbformat.h index 763d331e8..d0b3bbaaa 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -168,51 +168,57 @@ inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, EntryType GetEntryType(ValueType value_type); // Append the serialization of "key" to *result. -extern void AppendInternalKey(std::string* result, - const ParsedInternalKey& key); +void AppendInternalKey(std::string* result, const ParsedInternalKey& key); // Append the serialization of "key" to *result, replacing the original // timestamp with argument ts. -extern void AppendInternalKeyWithDifferentTimestamp( - std::string* result, const ParsedInternalKey& key, const Slice& ts); +void AppendInternalKeyWithDifferentTimestamp(std::string* result, + const ParsedInternalKey& key, + const Slice& ts); // Serialized internal key consists of user key followed by footer. // This function appends the footer to *result, assuming that *result already // contains the user key at the end. -extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s, - ValueType t); +void AppendInternalKeyFooter(std::string* result, SequenceNumber s, + ValueType t); // Append the key and a minimal timestamp to *result -extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, - size_t ts_sz); +void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // Append the key and a maximal timestamp to *result -extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, - size_t ts_sz); +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // `key` is a user key with timestamp. Append the user key without timestamp // and the maximal timestamp to *result. -extern void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, - size_t ts_sz); +void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // `key` is an internal key containing a user key without timestamp. Create a // new key in *result by padding a min timestamp of size `ts_sz` to the user key // and copying the remaining internal key bytes. -extern void PadInternalKeyWithMinTimestamp(std::string* result, - const Slice& key, size_t ts_sz); +void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // `key` is an internal key containing a user key with timestamp of size // `ts_sz`. Create a new internal key in *result by stripping the timestamp from // the user key and copying the remaining internal key bytes. -extern void StripTimestampFromInternalKey(std::string* result, const Slice& key, - size_t ts_sz); +void StripTimestampFromInternalKey(std::string* result, const Slice& key, + size_t ts_sz); + +// `key` is an internal key containing a user key with timestamp of size +// `ts_sz`. Create a new internal key in *result while replace the original +// timestamp with min timestamp. +void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // Attempt to parse an internal key from "internal_key". On success, // stores the parsed data in "*result", and returns true. // // On error, returns false, leaves "*result" in an undefined state. -extern Status ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result, bool log_err_key); +Status ParseInternalKey(const Slice& internal_key, ParsedInternalKey* result, + bool log_err_key); // Returns the user key portion of an internal key. inline Slice ExtractUserKey(const Slice& internal_key) { @@ -783,8 +789,7 @@ class InternalKeySliceTransform : public SliceTransform { // Read the key of a record from a write batch. // if this record represent the default column family then cf_record // must be passed as false, otherwise it must be passed as true. -extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, - bool cf_record); +bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record); // Read record from a write batch piece from input. // tag, column_family, key, value and blob are return values. Callers own the @@ -793,9 +798,9 @@ extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, // input will be advanced to after the record. // If user-defined timestamp is enabled for a column family, then the `key` // resulting from this call will include timestamp. -extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, - uint32_t* column_family, Slice* key, - Slice* value, Slice* blob, Slice* xid); +Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob, Slice* xid); // When user call DeleteRange() to delete a range of keys, // we will store a serialized RangeTombstone in MemTable and SST. diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index f82d1b57b..3b6190d92 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -312,6 +312,27 @@ TEST_F(FormatTest, StripTimestampFromInternalKey) { ASSERT_EQ(kTypeValue, key_without_timestamp.type); } +TEST_F(FormatTest, ReplaceInternalKeyWithMinTimestamp) { + std::string orig_user_key = "foo"; + size_t ts_sz = 8; + orig_user_key.append(ts_sz, static_cast(1)); + std::string orig_internal_key = IKey(orig_user_key, 100, kTypeValue); + + std::string key_buf; + ReplaceInternalKeyWithMinTimestamp(&key_buf, orig_internal_key, ts_sz); + ParsedInternalKey new_key; + Slice in(key_buf); + ASSERT_OK(ParseInternalKey(in, &new_key, true /*log_err_key*/)); + + std::string min_timestamp(ts_sz, static_cast(0)); + size_t ukey_diff_offset = new_key.user_key.difference_offset(orig_user_key); + ASSERT_EQ(min_timestamp, + Slice(new_key.user_key.data() + ukey_diff_offset, ts_sz)); + ASSERT_EQ(orig_user_key.size(), new_key.user_key.size()); + ASSERT_EQ(100, new_key.sequence); + ASSERT_EQ(kTypeValue, new_key.type); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 72332fc3a..f2915ed39 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -117,6 +117,8 @@ class FlushJobTestBase : public testing::Test { db_options_.statistics = CreateDBStatistics(); cf_options_.comparator = ucmp_; + cf_options_.persist_user_defined_timestamps = persist_udt_; + cf_options_.paranoid_file_checks = paranoid_file_checks_; std::vector column_families; cf_options_.table_factory = mock_table_factory_; @@ -149,6 +151,9 @@ class FlushJobTestBase : public testing::Test { std::atomic shutting_down_; std::shared_ptr mock_table_factory_; + bool persist_udt_ = true; + bool paranoid_file_checks_ = false; + SeqnoToTimeMapping empty_seqno_to_time_mapping_; }; @@ -600,7 +605,13 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) { } } -class FlushJobTimestampTest : public FlushJobTestBase { +// Test parameters: +// param 0): paranoid file check +// param 1): user-defined timestamp test mode +class FlushJobTimestampTest + : public FlushJobTestBase, + public testing::WithParamInterface< + std::tuple> { public: FlushJobTimestampTest() : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"), @@ -616,13 +627,36 @@ class FlushJobTimestampTest : public FlushJobTestBase { } protected: + void SetUp() override { + paranoid_file_checks_ = std::get<0>(GetParam()); + auto udt_test_mode = std::get<1>(GetParam()); + persist_udt_ = test::ShouldPersistUDT(udt_test_mode); + FlushJobTestBase::SetUp(); + } static constexpr uint64_t kStartTs = 10; static constexpr SequenceNumber kStartSeq = 0; SequenceNumber curr_seq_{kStartSeq}; std::atomic curr_ts_{kStartTs}; + + void CheckFileMetaData(ColumnFamilyData* cfd, + const InternalKey& expected_smallest, + const InternalKey& expected_largest, + const FileMetaData* meta_from_flush) const { + ASSERT_EQ(expected_smallest.Encode(), meta_from_flush->smallest.Encode()); + ASSERT_EQ(expected_largest.Encode(), meta_from_flush->largest.Encode()); + + const VersionStorageInfo* storage_info = cfd->current()->storage_info(); + const std::vector& l0_files = storage_info->LevelFiles(0); + + ASSERT_EQ(l0_files.size(), 1); + auto installed_file_meta = l0_files[0]; + ASSERT_EQ(expected_smallest.Encode(), + installed_file_meta->smallest.Encode()); + ASSERT_EQ(expected_largest.Encode(), installed_file_meta->largest.Encode()); + } }; -TEST_F(FlushJobTimestampTest, AllKeysExpired) { +TEST_P(FlushJobTimestampTest, AllKeysExpired) { ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); autovector to_delete; @@ -669,17 +703,24 @@ TEST_F(FlushJobTimestampTest, AllKeysExpired) { { std::string key = test::EncodeInt(0); - key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + if (!persist_udt_) { + // When `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` flag + // is set to false. The user-defined timestamp is stripped from user key + // during flush, making the user key logically containing the minimum + // timestamp. + key.append(test::EncodeInt(0)); + } else { + key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + } InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp); - ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode()); - ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode()); + CheckFileMetaData(cfd, ikey, ikey, &fmeta); } job_context.Clean(); ASSERT_TRUE(to_delete.empty()); } -TEST_F(FlushJobTimestampTest, NoKeyExpired) { +TEST_P(FlushJobTimestampTest, NoKeyExpired) { ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); autovector to_delete; @@ -722,18 +763,38 @@ TEST_F(FlushJobTimestampTest, NoKeyExpired) { { std::string ukey = test::EncodeInt(0); - std::string smallest_key = - ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); - std::string largest_key = ukey + test::EncodeInt(kStartTs); + std::string smallest_key; + std::string largest_key; + if (!persist_udt_) { + // When `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` flag + // is set to false. The user-defined timestamp is stripped from user key + // during flush, making the user key logically containing the minimum + // timestamp, which is hardcoded to be all zeros for now. + smallest_key = ukey + test::EncodeInt(0); + largest_key = ukey + test::EncodeInt(0); + } else { + smallest_key = + ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); + largest_key = ukey + test::EncodeInt(kStartTs); + } InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); - ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode()); - ASSERT_EQ(largest.Encode(), fmeta.largest.Encode()); + CheckFileMetaData(cfd, smallest, largest, &fmeta); } job_context.Clean(); ASSERT_TRUE(to_delete.empty()); } +// Param 0: paranoid file check +// Param 1: test mode for the user-defined timestamp feature +INSTANTIATE_TEST_CASE_P( + FlushJobTimestampTest, FlushJobTimestampTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values( + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + test::UserDefinedTimestampTestMode::kNormal))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/repair_test.cc b/db/repair_test.cc index a8cf5281d..8cca48424 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -315,7 +315,13 @@ TEST_F(RepairTest, UnflushedSst) { ASSERT_EQ(Get("key"), "val"); } -class RepairTestWithTimestamp : public DBBasicTestWithTimestampBase { +// Test parameters: +// param 0): paranoid file check +// param 1): user-defined timestamp test mode +class RepairTestWithTimestamp + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple> { public: RepairTestWithTimestamp() : DBBasicTestWithTimestampBase("repair_test_with_timestamp") {} @@ -326,23 +332,46 @@ class RepairTestWithTimestamp : public DBBasicTestWithTimestampBase { } void CheckGet(const ReadOptions& read_opts, const Slice& key, - const std::string& expected_value) { + const std::string& expected_value, + const std::string& expected_ts) { std::string actual_value; - ASSERT_OK(db_->Get(read_opts, handles_[0], key, &actual_value)); + std::string actual_ts; + ASSERT_OK(db_->Get(read_opts, handles_[0], key, &actual_value, &actual_ts)); ASSERT_EQ(expected_value, actual_value); + ASSERT_EQ(expected_ts, actual_ts); + } + + void CheckFileBoundaries(const Slice& smallest_user_key, + const Slice& largest_user_key) { + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GT(level_to_files.size(), 1); + // L0 only has one SST file. + ASSERT_EQ(level_to_files[0].size(), 1); + auto file_meta = level_to_files[0][0]; + ASSERT_EQ(smallest_user_key, file_meta.smallest.user_key()); + ASSERT_EQ(largest_user_key, file_meta.largest.user_key()); } }; -TEST_F(RepairTestWithTimestamp, UnflushedSst) { +TEST_P(RepairTestWithTimestamp, UnflushedSst) { Destroy(last_options_); + bool paranoid_file_checks = std::get<0>(GetParam()); + bool persist_udt = test::ShouldPersistUDT(std::get<1>(GetParam())); + std::string smallest_ukey_without_ts = "bar"; + std::string largest_ukey_without_ts = "foo"; Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - std::string ts = Timestamp(0, 0); - const size_t kTimestampSize = ts.size(); + std::string min_ts = Timestamp(0, 0); + std::string write_ts = Timestamp(1, 0); + const size_t kTimestampSize = write_ts.size(); TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; + options.persist_user_defined_timestamps = persist_udt; + options.paranoid_file_checks = paranoid_file_checks; ColumnFamilyOptions cf_options(options); std::vector column_families; @@ -351,7 +380,10 @@ TEST_F(RepairTestWithTimestamp, UnflushedSst) { ASSERT_OK(DB::Open(options, dbname_, column_families, &handles_, &db_)); - ASSERT_OK(Put("key", ts, "val")); + ASSERT_OK(Put(smallest_ukey_without_ts, write_ts, + smallest_ukey_without_ts + ":val")); + ASSERT_OK( + Put(largest_ukey_without_ts, write_ts, largest_ukey_without_ts + ":val")); VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 1); @@ -381,12 +413,46 @@ TEST_F(RepairTestWithTimestamp, UnflushedSst) { ASSERT_GT(total_ssts_size, 0); } + // Check file boundaries are correct for different + // `persist_user_defined_timestamps` option values. + if (persist_udt) { + CheckFileBoundaries(smallest_ukey_without_ts + write_ts, + largest_ukey_without_ts + write_ts); + } else { + CheckFileBoundaries(smallest_ukey_without_ts + min_ts, + largest_ukey_without_ts + min_ts); + } + ReadOptions read_opts; - Slice read_ts_slice = ts; + Slice read_ts_slice = write_ts; read_opts.timestamp = &read_ts_slice; - CheckGet(read_opts, "key", "val"); + if (persist_udt) { + CheckGet(read_opts, smallest_ukey_without_ts, + smallest_ukey_without_ts + ":val", write_ts); + CheckGet(read_opts, largest_ukey_without_ts, + largest_ukey_without_ts + ":val", write_ts); + } else { + // TODO(yuzhangyu): currently when `persist_user_defined_timestamps` is + // false, ts is unconditionally stripped during flush. + // When `full_history_ts_low` is set and respected during flush. + // We should prohibit reading below `full_history_ts_low` all together. + CheckGet(read_opts, smallest_ukey_without_ts, + smallest_ukey_without_ts + ":val", min_ts); + CheckGet(read_opts, largest_ukey_without_ts, + largest_ukey_without_ts + ":val", min_ts); + } } +// Param 0: paranoid file check +// Param 1: test mode for the user-defined timestamp feature +INSTANTIATE_TEST_CASE_P( + UnflushedSst, RepairTestWithTimestamp, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values( + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + test::UserDefinedTimestampTestMode::kNormal))); + TEST_F(RepairTest, SeparateWalDir) { do { Options options = CurrentOptions(); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 796d9be09..3af820b66 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -1120,11 +1120,30 @@ struct AdvancedColumnFamilyOptions { // // When it's false, the user-defined timestamps will be removed from the user // keys when data is flushed from memtables to SST files. Other places that - // user keys can be persisted like WAL and blob files go through a similar - // process. Users should call `DB::IncreaseFullHistoryTsLow` to set a cutoff - // timestamp. RocksDB refrains from flushing a memtable with data still above - // the cutoff timestamp with best effort. When users try to read below the - // cutoff timestamp, an error will be returned. + // user keys can be persisted like file boundaries in file metadata and blob + // files go through a similar process. There are two major motivations + // for this flag: + // 1) backward compatibility: if the user later decides to + // disable the user-defined timestamp feature for the column family, these SST + // files can be handled by a user comparator that is not aware of user-defined + // timestamps. + // 2) enable user-defined timestamp feature for an existing column family + // while set this flag to be `false`: user keys in the newly generated SST + // files are of the same format as the existing SST files. + // + // When setting this flag to `false`, users should also call + // `DB::IncreaseFullHistoryTsLow` to set a cutoff timestamp for flush. RocksDB + // refrains from flushing a memtable with data still above + // the cutoff timestamp with best effort. Users can do user-defined + // multi-versioned read above the cutoff timestamp. When users try to read + // below the cutoff timestamp, an error will be returned. + // + // Note that if WAL is enabled, unlike SST files, user-defined timestamps are + // persisted to WAL even if this flag is set to `false`. The benefit of this + // is that user-defined timestamps can be recovered with the caveat that users + // should flush all memtables so there is no active WAL files before doing a + // downgrade or toggling on / off the user-defined timestamp feature on a + // column family. // // Default: true (user-defined timestamps are persisted) // Not dynamically changeable, change it requires db restart and diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc index 239402dc3..877df81c1 100644 --- a/table/block_based/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -53,8 +53,7 @@ BlockBuilder::BlockBuilder( : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), use_value_delta_encoding_(use_value_delta_encoding), - ts_sz_(ts_sz), - persist_user_defined_timestamps_(persist_user_defined_timestamps), + strip_ts_sz_(persist_user_defined_timestamps ? 0 : ts_sz), is_user_key_(is_user_key), restarts_(1, 0), // First restart point is at offset 0 counter_(0), @@ -100,8 +99,8 @@ size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, // Note: this is an imprecise estimate as it accounts for the whole key size // instead of non-shared key size. estimate += key.size(); - if (ts_sz_ > 0 && !persist_user_defined_timestamps_) { - estimate -= ts_sz_; + if (strip_ts_sz_ > 0) { + estimate -= strip_ts_sz_; } // In value delta encoding we estimate the value delta size as half the full // value size since only the size field of block handle is encoded. @@ -175,13 +174,13 @@ void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, // or Reset. This is more convenient for the caller and we can be more // clever inside BlockBuilder. On this hot code path, we want to avoid // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a - // fast min operation instead, with an assertion to be sure our logic is - // sound. + // fast arithmetic operation instead, with an assertion to be sure our logic + // is sound. size_t buffer_size = buffer_.size(); size_t last_key_size = last_key_param.size(); - assert(buffer_size == 0 || buffer_size >= last_key_size); + assert(buffer_size == 0 || buffer_size >= last_key_size - strip_ts_sz_); - Slice last_key(last_key_param.data(), std::min(buffer_size, last_key_size)); + Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0)); AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size); } @@ -255,11 +254,11 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, const Slice BlockBuilder::MaybeStripTimestampFromKey(std::string* key_buf, const Slice& key) { Slice stripped_key = key; - if (ts_sz_ > 0 && !persist_user_defined_timestamps_) { + if (strip_ts_sz_ > 0) { if (is_user_key_) { - stripped_key.remove_suffix(ts_sz_); + stripped_key.remove_suffix(strip_ts_sz_); } else { - StripTimestampFromInternalKey(key_buf, key, ts_sz_); + StripTimestampFromInternalKey(key_buf, key, strip_ts_sz_); stripped_key = *key_buf; } } diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h index e9a050caa..f167470bb 100644 --- a/table/block_based/block_builder.h +++ b/table/block_based/block_builder.h @@ -94,11 +94,10 @@ class BlockBuilder { const bool use_delta_encoding_; // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values const bool use_value_delta_encoding_; - // Size in bytes for the user-defined timestamp in a user key. - const size_t ts_sz_; - // Whether the user-defined timestamp part in user keys should be persisted. - // If false, it will be stripped from the key before it's encoded. - const bool persist_user_defined_timestamps_; + // Size in bytes for the user-defined timestamp to strip in a user key. + // This is non-zero if there is user-defined timestamp in the user key and it + // should not be persisted. + const size_t strip_ts_sz_; // Whether the keys provided to build this block are user keys. If not, // the keys are internal keys. This will affect how timestamp stripping is // done for the key if `persisted_user_defined_timestamps_` is false and