Add seqno to time mapping (#10338)
	
		
	
				
					
				
			Summary: The mapping will be used for tiered storage to preclude hot data from compacting to the cold tier (the last level). Internally, this adds a seqno to time mapping. A periodic_task is scheduled to record the current_seqno -> current_time at a certain cadence. When a memtable is flushed, the mapping information is stored in an sstable property. During compaction, the mapping information is merged to get the approximate time of a sequence number, which is used to determine whether a key was recently inserted or not, and to preclude it from the last level if it was recently inserted (within the `preclude_last_level_data_seconds`). Pull Request resolved: https://github.com/facebook/rocksdb/pull/10338 Test Plan: CI Reviewed By: siying Differential Revision: D37810187 Pulled By: jay-zhuang fbshipit-source-id: 6953be7a18a99de8b1cb3b162d712f79c2b4899fmain
							parent
							
								
									66685d6aa1
								
							
						
					
					
						commit
						a3acf2ef87
					
				| @ -0,0 +1,612 @@ | ||||
| //  Copyright (c) Meta Platforms, Inc. and affiliates.
 | ||||
| //
 | ||||
| //  This source code is licensed under both the GPLv2 (found in the
 | ||||
| //  COPYING file in the root directory) and Apache 2.0 License
 | ||||
| //  (found in the LICENSE.Apache file in the root directory).
 | ||||
| 
 | ||||
| #include "db/db_test_util.h" | ||||
| #include "db/periodic_work_scheduler.h" | ||||
| #include "db/seqno_to_time_mapping.h" | ||||
| #include "port/stack_trace.h" | ||||
| #include "test_util/mock_time_env.h" | ||||
| 
 | ||||
| #ifndef ROCKSDB_LITE | ||||
| 
 | ||||
| namespace ROCKSDB_NAMESPACE { | ||||
| 
 | ||||
| class SeqnoTimeTest : public DBTestBase { | ||||
|  public: | ||||
|   SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) { | ||||
|     mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock()); | ||||
|     mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_); | ||||
|   } | ||||
| 
 | ||||
|  protected: | ||||
|   std::unique_ptr<Env> mock_env_; | ||||
|   std::shared_ptr<MockSystemClock> mock_clock_; | ||||
| 
 | ||||
|   void SetUp() override { | ||||
|     mock_clock_->InstallTimedWaitFixCallback(); | ||||
|     SyncPoint::GetInstance()->SetCallBack( | ||||
|         "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { | ||||
|           auto* periodic_work_scheduler_ptr = | ||||
|               reinterpret_cast<PeriodicWorkScheduler**>(arg); | ||||
|           *periodic_work_scheduler_ptr = | ||||
|               PeriodicWorkTestScheduler::Default(mock_clock_); | ||||
|         }); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| TEST_F(SeqnoTimeTest, BasicSeqnoToTimeMapping) { | ||||
|   Options options = CurrentOptions(); | ||||
|   options.preclude_last_level_data_seconds = 10000; | ||||
|   options.env = mock_env_.get(); | ||||
|   options.disable_auto_compactions = true; | ||||
|   DestroyAndReopen(options); | ||||
| 
 | ||||
|   std::set<uint64_t> checked_file_nums; | ||||
|   SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber(); | ||||
|   // Write a key every 10 seconds
 | ||||
|   for (int i = 0; i < 200; i++) { | ||||
|     ASSERT_OK(Put(Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); }); | ||||
|   } | ||||
|   ASSERT_OK(Flush()); | ||||
|   TablePropertiesCollection tables_props; | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 1); | ||||
|   auto it = tables_props.begin(); | ||||
|   SeqnoToTimeMapping tp_mapping; | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   ASSERT_FALSE(tp_mapping.Empty()); | ||||
|   auto seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   ASSERT_GE(seqs.size(), 19); | ||||
|   ASSERT_LE(seqs.size(), 21); | ||||
|   SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber(); | ||||
|   for (auto i = start_seq; i < start_seq + 10; i++) { | ||||
|     ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 1) * 10); | ||||
|   } | ||||
|   start_seq += 10; | ||||
|   for (auto i = start_seq; i < seq_end; i++) { | ||||
|     // The result is within the range
 | ||||
|     ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - 10) * 10); | ||||
|     ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 10) * 10); | ||||
|   } | ||||
|   checked_file_nums.insert(it->second->orig_file_number); | ||||
|   start_seq = seq_end; | ||||
| 
 | ||||
|   // Write a key every 1 seconds
 | ||||
|   for (int i = 0; i < 200; i++) { | ||||
|     ASSERT_OK(Put(Key(i + 190), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); }); | ||||
|   } | ||||
|   seq_end = dbfull()->GetLatestSequenceNumber(); | ||||
|   ASSERT_OK(Flush()); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 2); | ||||
|   it = tables_props.begin(); | ||||
|   while (it != tables_props.end()) { | ||||
|     if (!checked_file_nums.count(it->second->orig_file_number)) { | ||||
|       break; | ||||
|     } | ||||
|     it++; | ||||
|   } | ||||
|   ASSERT_TRUE(it != tables_props.end()); | ||||
| 
 | ||||
|   tp_mapping.Clear(); | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   // There only a few time sample
 | ||||
|   ASSERT_GE(seqs.size(), 1); | ||||
|   ASSERT_LE(seqs.size(), 3); | ||||
|   for (auto i = start_seq; i < seq_end; i++) { | ||||
|     // The result is not very accurate, as there is more data write within small
 | ||||
|     // range of time
 | ||||
|     ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 1000); | ||||
|     ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000); | ||||
|   } | ||||
|   checked_file_nums.insert(it->second->orig_file_number); | ||||
|   start_seq = seq_end; | ||||
| 
 | ||||
|   // Write a key every 200 seconds
 | ||||
|   for (int i = 0; i < 200; i++) { | ||||
|     ASSERT_OK(Put(Key(i + 380), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); }); | ||||
|   } | ||||
|   seq_end = dbfull()->GetLatestSequenceNumber(); | ||||
|   ASSERT_OK(Flush()); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 3); | ||||
|   it = tables_props.begin(); | ||||
|   while (it != tables_props.end()) { | ||||
|     if (!checked_file_nums.count(it->second->orig_file_number)) { | ||||
|       break; | ||||
|     } | ||||
|     it++; | ||||
|   } | ||||
|   ASSERT_TRUE(it != tables_props.end()); | ||||
| 
 | ||||
|   tp_mapping.Clear(); | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   // The sequence number -> time entries should be maxed
 | ||||
|   ASSERT_GE(seqs.size(), 99); | ||||
|   ASSERT_LE(seqs.size(), 101); | ||||
|   for (auto i = start_seq; i < seq_end - 99; i++) { | ||||
|     // likely the first 100 entries reports 0
 | ||||
|     ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000); | ||||
|   } | ||||
|   start_seq += 101; | ||||
| 
 | ||||
|   for (auto i = start_seq; i < seq_end; i++) { | ||||
|     ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), | ||||
|               (i - start_seq) * 200 + 22200); | ||||
|     ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), | ||||
|               (i - start_seq) * 200 + 22600); | ||||
|   } | ||||
|   checked_file_nums.insert(it->second->orig_file_number); | ||||
|   start_seq = seq_end; | ||||
| 
 | ||||
|   // Write a key every 100 seconds
 | ||||
|   for (int i = 0; i < 200; i++) { | ||||
|     ASSERT_OK(Put(Key(i + 570), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|   } | ||||
|   seq_end = dbfull()->GetLatestSequenceNumber(); | ||||
|   ASSERT_OK(Flush()); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 4); | ||||
|   it = tables_props.begin(); | ||||
|   while (it != tables_props.end()) { | ||||
|     if (!checked_file_nums.count(it->second->orig_file_number)) { | ||||
|       break; | ||||
|     } | ||||
|     it++; | ||||
|   } | ||||
|   ASSERT_TRUE(it != tables_props.end()); | ||||
|   tp_mapping.Clear(); | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   ASSERT_GE(seqs.size(), 99); | ||||
|   ASSERT_LE(seqs.size(), 101); | ||||
| 
 | ||||
|   checked_file_nums.insert(it->second->orig_file_number); | ||||
| 
 | ||||
|   // re-enable compaction
 | ||||
|   ASSERT_OK(dbfull()->SetOptions({ | ||||
|       {"disable_auto_compactions", "false"}, | ||||
|   })); | ||||
| 
 | ||||
|   ASSERT_OK(dbfull()->TEST_WaitForCompact()); | ||||
| 
 | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_GE(tables_props.size(), 1); | ||||
|   it = tables_props.begin(); | ||||
|   while (it != tables_props.end()) { | ||||
|     if (!checked_file_nums.count(it->second->orig_file_number)) { | ||||
|       break; | ||||
|     } | ||||
|     it++; | ||||
|   } | ||||
|   ASSERT_TRUE(it != tables_props.end()); | ||||
|   tp_mapping.Clear(); | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   ASSERT_GE(seqs.size(), 99); | ||||
|   ASSERT_LE(seqs.size(), 101); | ||||
|   for (auto i = start_seq; i < seq_end - 99; i++) { | ||||
|     // likely the first 100 entries reports 0
 | ||||
|     ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000); | ||||
|   } | ||||
|   start_seq += 101; | ||||
| 
 | ||||
|   for (auto i = start_seq; i < seq_end; i++) { | ||||
|     ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), | ||||
|               (i - start_seq) * 100 + 52200); | ||||
|     ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), | ||||
|               (i - start_seq) * 100 + 52400); | ||||
|   } | ||||
|   ASSERT_OK(db_->Close()); | ||||
| } | ||||
| 
 | ||||
| // TODO(zjay): Disabled, until New CF bug with preclude_last_level_data_seconds
 | ||||
| //  is fixed
 | ||||
| TEST_F(SeqnoTimeTest, DISABLED_MultiCFs) { | ||||
|   Options options = CurrentOptions(); | ||||
|   options.preclude_last_level_data_seconds = 0; | ||||
|   options.env = mock_env_.get(); | ||||
|   options.stats_dump_period_sec = 0; | ||||
|   options.stats_persist_period_sec = 0; | ||||
|   ReopenWithColumnFamilies({"default"}, options); | ||||
| 
 | ||||
|   auto scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); | ||||
|   ASSERT_FALSE(scheduler->TEST_HasValidTask( | ||||
|       dbfull(), PeriodicWorkTaskNames::kRecordSeqnoTime)); | ||||
| 
 | ||||
|   // Write some data and increase the current time
 | ||||
|   for (int i = 0; i < 200; i++) { | ||||
|     ASSERT_OK(Put(Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|   } | ||||
|   ASSERT_OK(Flush()); | ||||
|   TablePropertiesCollection tables_props; | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 1); | ||||
|   auto it = tables_props.begin(); | ||||
|   ASSERT_TRUE(it->second->seqno_to_time_mapping.empty()); | ||||
| 
 | ||||
|   ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty()); | ||||
| 
 | ||||
|   Options options_1 = options; | ||||
|   options_1.preclude_last_level_data_seconds = 10000;  // 10k
 | ||||
|   CreateColumnFamilies({"one"}, options_1); | ||||
|   ASSERT_TRUE(scheduler->TEST_HasValidTask( | ||||
|       dbfull(), PeriodicWorkTaskNames::kRecordSeqnoTime)); | ||||
| 
 | ||||
|   // Write some data to the default CF (without preclude_last_level feature)
 | ||||
|   for (int i = 0; i < 200; i++) { | ||||
|     ASSERT_OK(Put(Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|   } | ||||
|   ASSERT_OK(Flush()); | ||||
| 
 | ||||
|   // in memory mapping won't increase because CFs with preclude_last_level
 | ||||
|   // feature doesn't have memtable
 | ||||
|   auto queue = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping(); | ||||
|   ASSERT_LT(queue.size(), 5); | ||||
| 
 | ||||
|   // Write some data to the CF one
 | ||||
|   for (int i = 0; i < 20; i++) { | ||||
|     ASSERT_OK(Put(1, Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); }); | ||||
|   } | ||||
|   ASSERT_OK(Flush(1)); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 1); | ||||
|   it = tables_props.begin(); | ||||
|   SeqnoToTimeMapping tp_mapping; | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   ASSERT_FALSE(tp_mapping.Empty()); | ||||
|   auto seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   ASSERT_GE(seqs.size(), 1); | ||||
|   ASSERT_LE(seqs.size(), 3); | ||||
| 
 | ||||
|   // Create one more CF with larger preclude_last_level time
 | ||||
|   Options options_2 = options; | ||||
|   options_2.preclude_last_level_data_seconds = 1000000;  // 1m
 | ||||
|   CreateColumnFamilies({"two"}, options_2); | ||||
| 
 | ||||
|   // Add more data to CF "two" to fill the in memory mapping
 | ||||
|   for (int i = 0; i < 2000; i++) { | ||||
|     ASSERT_OK(Put(2, Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|   } | ||||
|   seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping(); | ||||
|   ASSERT_GE(seqs.size(), 1000 - 1); | ||||
|   ASSERT_LE(seqs.size(), 1000 + 1); | ||||
| 
 | ||||
|   ASSERT_OK(Flush(2)); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 1); | ||||
|   it = tables_props.begin(); | ||||
|   tp_mapping.Clear(); | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   // the max encoded entries is 100
 | ||||
|   ASSERT_GE(seqs.size(), 100 - 1); | ||||
|   ASSERT_LE(seqs.size(), 100 + 1); | ||||
| 
 | ||||
|   // Write some data to default CF, as all memtable with preclude_last_level
 | ||||
|   // enabled have flushed, the in-memory seqno->time mapping should be cleared
 | ||||
|   for (int i = 0; i < 10; i++) { | ||||
|     ASSERT_OK(Put(0, Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|   } | ||||
|   seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping(); | ||||
|   ASSERT_LE(seqs.size(), 5); | ||||
|   ASSERT_OK(Flush(0)); | ||||
| 
 | ||||
|   // trigger compaction for CF "two" and make sure the compaction output has
 | ||||
|   // seqno_to_time_mapping
 | ||||
|   for (int j = 0; j < 3; j++) { | ||||
|     for (int i = 0; i < 200; i++) { | ||||
|       ASSERT_OK(Put(2, Key(i), "value")); | ||||
|       dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|           [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|     } | ||||
|     ASSERT_OK(Flush(2)); | ||||
|   } | ||||
|   ASSERT_OK(dbfull()->TEST_WaitForCompact()); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 1); | ||||
|   it = tables_props.begin(); | ||||
|   tp_mapping.Clear(); | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   ASSERT_OK(tp_mapping.Sort()); | ||||
|   seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|   ASSERT_GE(seqs.size(), 99); | ||||
|   ASSERT_LE(seqs.size(), 101); | ||||
| 
 | ||||
|   for (int j = 0; j < 2; j++) { | ||||
|     for (int i = 0; i < 200; i++) { | ||||
|       ASSERT_OK(Put(0, Key(i), "value")); | ||||
|       dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|           [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|     } | ||||
|     ASSERT_OK(Flush(0)); | ||||
|   } | ||||
|   ASSERT_OK(dbfull()->TEST_WaitForCompact()); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 1); | ||||
|   it = tables_props.begin(); | ||||
|   ASSERT_TRUE(it->second->seqno_to_time_mapping.empty()); | ||||
| 
 | ||||
|   // Write some data to CF "two", but don't flush to accumulate
 | ||||
|   for (int i = 0; i < 1000; i++) { | ||||
|     ASSERT_OK(Put(2, Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); }); | ||||
|   } | ||||
|   ASSERT_GE( | ||||
|       dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(), | ||||
|       500); | ||||
|   // After dropping CF "one", the in-memory mapping will be change to only
 | ||||
|   // follow CF "two" options.
 | ||||
|   ASSERT_OK(db_->DropColumnFamily(handles_[1])); | ||||
|   ASSERT_LE( | ||||
|       dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(), | ||||
|       100 + 5); | ||||
| 
 | ||||
|   // After dropping CF "two", the in-memory mapping is also clear.
 | ||||
|   ASSERT_OK(db_->DropColumnFamily(handles_[2])); | ||||
|   ASSERT_EQ( | ||||
|       dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(), | ||||
|       0); | ||||
| 
 | ||||
|   // And the timer worker is stopped
 | ||||
|   ASSERT_FALSE(scheduler->TEST_HasValidTask( | ||||
|       dbfull(), PeriodicWorkTaskNames::kRecordSeqnoTime)); | ||||
|   Close(); | ||||
| } | ||||
| 
 | ||||
| TEST_F(SeqnoTimeTest, SeqnoToTimeMappingUniversal) { | ||||
|   Options options = CurrentOptions(); | ||||
|   options.compaction_style = kCompactionStyleUniversal; | ||||
|   options.preclude_last_level_data_seconds = 10000; | ||||
|   options.env = mock_env_.get(); | ||||
| 
 | ||||
|   DestroyAndReopen(options); | ||||
| 
 | ||||
|   for (int j = 0; j < 3; j++) { | ||||
|     for (int i = 0; i < 100; i++) { | ||||
|       ASSERT_OK(Put(Key(i), "value")); | ||||
|       dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|           [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); }); | ||||
|     } | ||||
|     ASSERT_OK(Flush()); | ||||
|   } | ||||
|   TablePropertiesCollection tables_props; | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 3); | ||||
|   for (const auto& props : tables_props) { | ||||
|     ASSERT_FALSE(props.second->seqno_to_time_mapping.empty()); | ||||
|     SeqnoToTimeMapping tp_mapping; | ||||
|     ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping)); | ||||
|     ASSERT_OK(tp_mapping.Sort()); | ||||
|     ASSERT_FALSE(tp_mapping.Empty()); | ||||
|     auto seqs = tp_mapping.TEST_GetInternalMapping(); | ||||
|     ASSERT_GE(seqs.size(), 10 - 1); | ||||
|     ASSERT_LE(seqs.size(), 10 + 1); | ||||
|   } | ||||
| 
 | ||||
|   // Trigger a compaction
 | ||||
|   for (int i = 0; i < 100; i++) { | ||||
|     ASSERT_OK(Put(Key(i), "value")); | ||||
|     dbfull()->TEST_WaitForPeridicWorkerRun( | ||||
|         [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); }); | ||||
|   } | ||||
|   ASSERT_OK(Flush()); | ||||
|   ASSERT_OK(dbfull()->TEST_WaitForCompact()); | ||||
|   tables_props.clear(); | ||||
|   ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); | ||||
|   ASSERT_EQ(tables_props.size(), 1); | ||||
| 
 | ||||
|   auto it = tables_props.begin(); | ||||
|   SeqnoToTimeMapping tp_mapping; | ||||
|   ASSERT_FALSE(it->second->seqno_to_time_mapping.empty()); | ||||
|   ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); | ||||
|   Close(); | ||||
| } | ||||
| 
 | ||||
| TEST_F(SeqnoTimeTest, MappingAppend) { | ||||
|   SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); | ||||
| 
 | ||||
|   // ignore seqno == 0, as it may mean the seqno is zeroed out
 | ||||
|   ASSERT_FALSE(test.Append(0, 9)); | ||||
| 
 | ||||
|   ASSERT_TRUE(test.Append(3, 10)); | ||||
|   auto size = test.Size(); | ||||
|   // normal add
 | ||||
|   ASSERT_TRUE(test.Append(10, 11)); | ||||
|   size++; | ||||
|   ASSERT_EQ(size, test.Size()); | ||||
| 
 | ||||
|   // Append unsorted
 | ||||
|   ASSERT_FALSE(test.Append(8, 12)); | ||||
|   ASSERT_EQ(size, test.Size()); | ||||
| 
 | ||||
|   // Append with the same seqno, newer time will be accepted
 | ||||
|   ASSERT_TRUE(test.Append(10, 12)); | ||||
|   ASSERT_EQ(size, test.Size()); | ||||
|   // older time will be ignored
 | ||||
|   ASSERT_FALSE(test.Append(10, 9)); | ||||
|   ASSERT_EQ(size, test.Size()); | ||||
| 
 | ||||
|   // new seqno with old time will be ignored
 | ||||
|   ASSERT_FALSE(test.Append(12, 8)); | ||||
|   ASSERT_EQ(size, test.Size()); | ||||
| } | ||||
| 
 | ||||
| TEST_F(SeqnoTimeTest, GetOldestApproximateTime) { | ||||
|   SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); | ||||
| 
 | ||||
|   ASSERT_EQ(test.GetOldestApproximateTime(10), kUnknownSeqnoTime); | ||||
| 
 | ||||
|   test.Append(3, 10); | ||||
| 
 | ||||
|   ASSERT_EQ(test.GetOldestApproximateTime(2), kUnknownSeqnoTime); | ||||
|   ASSERT_EQ(test.GetOldestApproximateTime(3), 10); | ||||
|   ASSERT_EQ(test.GetOldestApproximateTime(10), 10); | ||||
| 
 | ||||
|   test.Append(10, 100); | ||||
| 
 | ||||
|   test.Append(100, 1000); | ||||
|   ASSERT_EQ(test.GetOldestApproximateTime(10), 100); | ||||
|   ASSERT_EQ(test.GetOldestApproximateTime(40), 100); | ||||
|   ASSERT_EQ(test.GetOldestApproximateTime(111), 1000); | ||||
| } | ||||
| 
 | ||||
| TEST_F(SeqnoTimeTest, Sort) { | ||||
|   SeqnoToTimeMapping test; | ||||
| 
 | ||||
|   // single entry
 | ||||
|   test.Add(10, 11); | ||||
|   ASSERT_OK(test.Sort()); | ||||
|   ASSERT_EQ(test.Size(), 1); | ||||
| 
 | ||||
|   // duplicate, should be removed by sort
 | ||||
|   test.Add(10, 11); | ||||
|   // same seqno, but older time, should be removed
 | ||||
|   test.Add(10, 9); | ||||
| 
 | ||||
|   // unuseful ones, should be removed by sort
 | ||||
|   test.Add(11, 9); | ||||
|   test.Add(9, 8); | ||||
| 
 | ||||
|   // Good ones
 | ||||
|   test.Add(1, 10); | ||||
|   test.Add(100, 100); | ||||
| 
 | ||||
|   ASSERT_OK(test.Sort()); | ||||
| 
 | ||||
|   auto seqs = test.TEST_GetInternalMapping(); | ||||
| 
 | ||||
|   std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected; | ||||
|   expected.emplace_back(1, 10); | ||||
|   expected.emplace_back(10, 11); | ||||
|   expected.emplace_back(100, 100); | ||||
| 
 | ||||
|   ASSERT_EQ(expected, seqs); | ||||
| } | ||||
| 
 | ||||
| TEST_F(SeqnoTimeTest, EncodeDecodeBasic) { | ||||
|   SeqnoToTimeMapping test(0, 1000); | ||||
| 
 | ||||
|   std::string output; | ||||
|   test.Encode(output, 0, 1000, 100); | ||||
|   ASSERT_TRUE(output.empty()); | ||||
| 
 | ||||
|   for (int i = 1; i <= 1000; i++) { | ||||
|     ASSERT_TRUE(test.Append(i, i * 10)); | ||||
|   } | ||||
|   test.Encode(output, 0, 1000, 100); | ||||
| 
 | ||||
|   ASSERT_FALSE(output.empty()); | ||||
| 
 | ||||
|   SeqnoToTimeMapping decoded; | ||||
|   ASSERT_OK(decoded.Add(output)); | ||||
|   ASSERT_OK(decoded.Sort()); | ||||
|   ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST); | ||||
|   ASSERT_EQ(test.Size(), 1000); | ||||
| 
 | ||||
|   for (SequenceNumber seq = 0; seq <= 1000; seq++) { | ||||
|     // test has the more accurate time mapping, encode only pick
 | ||||
|     // kMaxSeqnoTimePairsPerSST number of entries, which is less accurate
 | ||||
|     uint64_t target_time = test.GetOldestApproximateTime(seq); | ||||
|     ASSERT_GE(decoded.GetOldestApproximateTime(seq), | ||||
|               target_time < 200 ? 0 : target_time - 200); | ||||
|     ASSERT_LE(decoded.GetOldestApproximateTime(seq), target_time); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) { | ||||
|   SeqnoToTimeMapping test(0, 10); | ||||
| 
 | ||||
|   test.Append(1, 10); | ||||
|   test.Append(5, 17); | ||||
|   test.Append(6, 25); | ||||
|   test.Append(8, 30); | ||||
| 
 | ||||
|   std::string output; | ||||
|   test.Encode(output, 1, 10, 0, 3); | ||||
| 
 | ||||
|   SeqnoToTimeMapping decoded; | ||||
|   ASSERT_OK(decoded.Add(output)); | ||||
|   ASSERT_OK(decoded.Sort()); | ||||
| 
 | ||||
|   ASSERT_EQ(decoded.Size(), 3); | ||||
| 
 | ||||
|   auto seqs = decoded.TEST_GetInternalMapping(); | ||||
|   std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected; | ||||
|   expected.emplace_back(1, 10); | ||||
|   expected.emplace_back(6, 25); | ||||
|   expected.emplace_back(8, 30); | ||||
|   ASSERT_EQ(expected, seqs); | ||||
| 
 | ||||
|   // Add a few large time number
 | ||||
|   test.Append(10, 100); | ||||
|   test.Append(13, 200); | ||||
|   test.Append(16, 300); | ||||
| 
 | ||||
|   output.clear(); | ||||
|   test.Encode(output, 1, 20, 0, 4); | ||||
|   decoded.Clear(); | ||||
|   ASSERT_OK(decoded.Add(output)); | ||||
|   ASSERT_OK(decoded.Sort()); | ||||
|   ASSERT_EQ(decoded.Size(), 4); | ||||
| 
 | ||||
|   expected.clear(); | ||||
|   expected.emplace_back(1, 10); | ||||
|   // entry #6, #8 are skipped as they are too close to #1.
 | ||||
|   // entry #100 is also within skip range, but if it's skipped, there not enough
 | ||||
|   // number to fill 4 entries, so select it.
 | ||||
|   expected.emplace_back(10, 100); | ||||
|   expected.emplace_back(13, 200); | ||||
|   expected.emplace_back(16, 300); | ||||
|   seqs = decoded.TEST_GetInternalMapping(); | ||||
|   ASSERT_EQ(expected, seqs); | ||||
| } | ||||
| 
 | ||||
| }  // namespace ROCKSDB_NAMESPACE
 | ||||
| 
 | ||||
| #endif  // ROCKSDB_LITE
 | ||||
| 
 | ||||
| int main(int argc, char** argv) { | ||||
|   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); | ||||
|   ::testing::InitGoogleTest(&argc, argv); | ||||
|   return RUN_ALL_TESTS(); | ||||
| } | ||||
| @ -0,0 +1,320 @@ | ||||
| //  Copyright (c) Meta Platforms, Inc. and affiliates.
 | ||||
| //
 | ||||
| //  This source code is licensed under both the GPLv2 (found in the
 | ||||
| //  COPYING file in the root directory) and Apache 2.0 License
 | ||||
| //  (found in the LICENSE.Apache file in the root directory).
 | ||||
| 
 | ||||
| #include "db/seqno_to_time_mapping.h" | ||||
| 
 | ||||
| #include "db/version_edit.h" | ||||
| #include "util/string_util.h" | ||||
| 
 | ||||
| namespace ROCKSDB_NAMESPACE { | ||||
| 
 | ||||
| uint64_t SeqnoToTimeMapping::GetOldestApproximateTime( | ||||
|     const SequenceNumber seqno) const { | ||||
|   assert(is_sorted_); | ||||
|   auto it = std::upper_bound(seqno_time_mapping_.begin(), | ||||
|                              seqno_time_mapping_.end(), seqno); | ||||
|   if (it == seqno_time_mapping_.begin()) { | ||||
|     return 0; | ||||
|   } | ||||
|   it--; | ||||
|   return it->time; | ||||
| } | ||||
| 
 | ||||
| void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) { | ||||
|   if (seqno == 0) { | ||||
|     return; | ||||
|   } | ||||
|   is_sorted_ = false; | ||||
|   seqno_time_mapping_.emplace_back(seqno, time); | ||||
| } | ||||
| 
 | ||||
| SequenceNumber SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) { | ||||
|   assert(is_sorted_); | ||||
| 
 | ||||
|   if (max_time_duration_ == 0) { | ||||
|     return 0; | ||||
|   } | ||||
| 
 | ||||
|   const uint64_t cut_off_time = | ||||
|       now > max_time_duration_ ? now - max_time_duration_ : 0; | ||||
|   assert(cut_off_time < now);  // no overflow
 | ||||
| 
 | ||||
|   auto it = std::upper_bound( | ||||
|       seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time, | ||||
|       [](uint64_t target, const SeqnoTimePair& other) -> bool { | ||||
|         return target < other.time; | ||||
|       }); | ||||
|   if (it == seqno_time_mapping_.begin()) { | ||||
|     return 0; | ||||
|   } | ||||
|   it--; | ||||
|   seqno_time_mapping_.erase(seqno_time_mapping_.begin(), it); | ||||
| 
 | ||||
|   return seqno_time_mapping_.front().seqno; | ||||
| } | ||||
| 
 | ||||
| // The encoded format is:
 | ||||
| //  [num_of_entries][[seqno][time],[seqno][time],...]
 | ||||
| //      ^                                 ^
 | ||||
| //    var_int                      delta_encoded (var_int)
 | ||||
| void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start, | ||||
|                                 const SequenceNumber end, const uint64_t now, | ||||
|                                 const uint64_t output_size) const { | ||||
|   assert(is_sorted_); | ||||
|   if (start > end) { | ||||
|     // It could happen when the SST file is empty, the initial value of min
 | ||||
|     // sequence number is kMaxSequenceNumber and max is 0.
 | ||||
|     // The empty output file will be removed in the final step of compaction.
 | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   auto start_it = std::upper_bound(seqno_time_mapping_.begin(), | ||||
|                                    seqno_time_mapping_.end(), start); | ||||
|   if (start_it != seqno_time_mapping_.begin()) { | ||||
|     start_it--; | ||||
|   } | ||||
| 
 | ||||
|   auto end_it = std::upper_bound(seqno_time_mapping_.begin(), | ||||
|                                  seqno_time_mapping_.end(), end); | ||||
|   if (end_it == seqno_time_mapping_.begin()) { | ||||
|     return; | ||||
|   } | ||||
|   if (start_it >= end_it) { | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   // truncate old entries that are not needed
 | ||||
|   if (max_time_duration_ > 0) { | ||||
|     const uint64_t cut_off_time = | ||||
|         now > max_time_duration_ ? now - max_time_duration_ : 0; | ||||
|     while (start_it < end_it && start_it->time < cut_off_time) { | ||||
|       start_it++; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   // If there are more data than needed, pick the entries for encoding.
 | ||||
|   // It's not the most optimized algorithm for selecting the best representative
 | ||||
|   // entries over the time.
 | ||||
|   // It starts from the beginning and makes sure the distance is larger than
 | ||||
|   // `(end - start) / size` before selecting the number. For example, for the
 | ||||
|   // following list, pick 3 entries (it will pick seqno #1, #6, #8):
 | ||||
|   //    1 -> 10
 | ||||
|   //    5 -> 17
 | ||||
|   //    6 -> 25
 | ||||
|   //    8 -> 30
 | ||||
|   // first, it always picks the first one, then there are 2 num_entries_to_fill
 | ||||
|   // and the time difference between current one vs. the last one is
 | ||||
|   // (30 - 10) = 20. 20/2 = 10. So it will skip until 10+10 = 20. => it skips
 | ||||
|   // #5 and pick #6.
 | ||||
|   // But the most optimized solution is picking #1 #5 #8, as it will be more
 | ||||
|   // evenly distributed for time. Anyway the following algorithm is simple and
 | ||||
|   // may over-select new data, which is good. We do want more accurate time
 | ||||
|   // information for recent data.
 | ||||
|   std::deque<SeqnoTimePair> output_copy; | ||||
|   if (std::distance(start_it, end_it) > static_cast<int64_t>(output_size)) { | ||||
|     int64_t num_entries_to_fill = static_cast<int64_t>(output_size); | ||||
|     auto last_it = end_it; | ||||
|     last_it--; | ||||
|     uint64_t end_time = last_it->time; | ||||
|     uint64_t skip_until_time = 0; | ||||
|     for (auto it = start_it; it < end_it; it++) { | ||||
|       // skip if it's not reach the skip_until_time yet
 | ||||
|       if (std::distance(it, end_it) > num_entries_to_fill && | ||||
|           it->time < skip_until_time) { | ||||
|         continue; | ||||
|       } | ||||
|       output_copy.push_back(*it); | ||||
|       num_entries_to_fill--; | ||||
|       if (std::distance(it, end_it) > num_entries_to_fill && | ||||
|           num_entries_to_fill > 0) { | ||||
|         // If there are more entries than we need, re-calculate the
 | ||||
|         // skip_until_time, which means skip until that time
 | ||||
|         skip_until_time = | ||||
|             it->time + ((end_time - it->time) / num_entries_to_fill); | ||||
|       } | ||||
|     } | ||||
| 
 | ||||
|     // Make sure all entries are filled
 | ||||
|     assert(num_entries_to_fill == 0); | ||||
|     start_it = output_copy.begin(); | ||||
|     end_it = output_copy.end(); | ||||
|   } | ||||
| 
 | ||||
|   // Delta encode the data
 | ||||
|   uint64_t size = std::distance(start_it, end_it); | ||||
|   PutVarint64(&dest, size); | ||||
|   SeqnoTimePair base; | ||||
|   for (auto it = start_it; it < end_it; it++) { | ||||
|     assert(base < *it); | ||||
|     SeqnoTimePair val = *it - base; | ||||
|     base = *it; | ||||
|     val.Encode(dest); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) { | ||||
|   Slice input(seqno_time_mapping_str); | ||||
|   if (input.empty()) { | ||||
|     return Status::OK(); | ||||
|   } | ||||
|   uint64_t size; | ||||
|   if (!GetVarint64(&input, &size)) { | ||||
|     return Status::Corruption("Invalid sequence number time size"); | ||||
|   } | ||||
|   is_sorted_ = false; | ||||
|   SeqnoTimePair base; | ||||
|   for (uint64_t i = 0; i < size; i++) { | ||||
|     SeqnoTimePair val; | ||||
|     Status s = val.Decode(input); | ||||
|     if (!s.ok()) { | ||||
|       return s; | ||||
|     } | ||||
|     val.Add(base); | ||||
|     seqno_time_mapping_.emplace_back(val); | ||||
|     base = val; | ||||
|   } | ||||
|   return Status::OK(); | ||||
| } | ||||
| 
 | ||||
| void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const { | ||||
|   PutVarint64Varint64(&dest, seqno, time); | ||||
| } | ||||
| 
 | ||||
| Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) { | ||||
|   if (!GetVarint64(&input, &seqno)) { | ||||
|     return Status::Corruption("Invalid sequence number"); | ||||
|   } | ||||
|   if (!GetVarint64(&input, &time)) { | ||||
|     return Status::Corruption("Invalid time"); | ||||
|   } | ||||
|   return Status::OK(); | ||||
| } | ||||
| 
 | ||||
| bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) { | ||||
|   assert(is_sorted_); | ||||
| 
 | ||||
|   // skip seq number 0, which may have special meaning, like zeroed out data
 | ||||
|   if (seqno == 0) { | ||||
|     return false; | ||||
|   } | ||||
|   if (!Empty()) { | ||||
|     if (seqno < Last().seqno || time < Last().time) { | ||||
|       return false; | ||||
|     } | ||||
|     if (seqno == Last().seqno) { | ||||
|       Last().time = time; | ||||
|       return true; | ||||
|     } | ||||
|     if (time == Last().time) { | ||||
|       // new sequence has the same time as old one, no need to add new mapping
 | ||||
|       return false; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   seqno_time_mapping_.emplace_back(seqno, time); | ||||
| 
 | ||||
|   if (seqno_time_mapping_.size() > max_capacity_) { | ||||
|     seqno_time_mapping_.pop_front(); | ||||
|   } | ||||
|   return true; | ||||
| } | ||||
| 
 | ||||
| bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration, | ||||
|                                 uint64_t max_time_duration) { | ||||
|   uint64_t new_max_capacity = | ||||
|       CalculateMaxCapacity(min_time_duration, max_time_duration); | ||||
|   if (new_max_capacity == max_capacity_) { | ||||
|     return false; | ||||
|   } else if (new_max_capacity < seqno_time_mapping_.size()) { | ||||
|     uint64_t delta = seqno_time_mapping_.size() - new_max_capacity; | ||||
|     seqno_time_mapping_.erase(seqno_time_mapping_.begin(), | ||||
|                               seqno_time_mapping_.begin() + delta); | ||||
|   } | ||||
|   max_capacity_ = new_max_capacity; | ||||
|   return true; | ||||
| } | ||||
| 
 | ||||
| Status SeqnoToTimeMapping::Sort() { | ||||
|   if (is_sorted_ || seqno_time_mapping_.empty()) { | ||||
|     return Status::OK(); | ||||
|   } | ||||
| 
 | ||||
|   std::deque<SeqnoTimePair> copy = std::move(seqno_time_mapping_); | ||||
| 
 | ||||
|   std::sort(copy.begin(), copy.end()); | ||||
| 
 | ||||
|   seqno_time_mapping_.clear(); | ||||
| 
 | ||||
|   // remove seqno = 0, which may have special meaning, like zeroed out data
 | ||||
|   while (copy.front().seqno == 0) { | ||||
|     copy.pop_front(); | ||||
|   } | ||||
| 
 | ||||
|   SeqnoTimePair prev = copy.front(); | ||||
|   for (const auto& it : copy) { | ||||
|     // If sequence number is the same, pick the one with larger time, which is
 | ||||
|     // more accurate than the older time.
 | ||||
|     if (it.seqno == prev.seqno) { | ||||
|       assert(it.time >= prev.time); | ||||
|       prev.time = it.time; | ||||
|     } else { | ||||
|       assert(it.seqno > prev.seqno); | ||||
|       // If a larger sequence number has an older time which is not useful, skip
 | ||||
|       if (it.time > prev.time) { | ||||
|         seqno_time_mapping_.push_back(prev); | ||||
|         prev = it; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   seqno_time_mapping_.emplace_back(prev); | ||||
| 
 | ||||
|   is_sorted_ = true; | ||||
|   return Status::OK(); | ||||
| } | ||||
| 
 | ||||
| std::string SeqnoToTimeMapping::ToHumanString() const { | ||||
|   std::string ret; | ||||
|   for (const auto& seq_time : seqno_time_mapping_) { | ||||
|     AppendNumberTo(&ret, seq_time.seqno); | ||||
|     ret.append("->"); | ||||
|     AppendNumberTo(&ret, seq_time.time); | ||||
|     ret.append(","); | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
| 
 | ||||
| SeqnoToTimeMapping SeqnoToTimeMapping::Copy( | ||||
|     SequenceNumber smallest_seqno) const { | ||||
|   SeqnoToTimeMapping ret; | ||||
|   auto it = std::upper_bound(seqno_time_mapping_.begin(), | ||||
|                              seqno_time_mapping_.end(), smallest_seqno); | ||||
|   if (it != seqno_time_mapping_.begin()) { | ||||
|     it--; | ||||
|   } | ||||
|   std::copy(it, seqno_time_mapping_.end(), | ||||
|             std::back_inserter(ret.seqno_time_mapping_)); | ||||
|   return ret; | ||||
| } | ||||
| 
 | ||||
| uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration, | ||||
|                                                   uint64_t max_time_duration) { | ||||
|   if (min_time_duration == 0) { | ||||
|     return 0; | ||||
|   } | ||||
|   return std::min( | ||||
|       kMaxSeqnoToTimeEntries, | ||||
|       max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration); | ||||
| } | ||||
| 
 | ||||
| SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-( | ||||
|     const SeqnoTimePair& other) const { | ||||
|   SeqnoTimePair res; | ||||
|   res.seqno = seqno - other.seqno; | ||||
|   res.time = time - other.time; | ||||
|   return res; | ||||
| } | ||||
| 
 | ||||
| }  // namespace ROCKSDB_NAMESPACE
 | ||||
| @ -0,0 +1,183 @@ | ||||
| //  Copyright (c) Meta Platforms, Inc. and affiliates.
 | ||||
| //
 | ||||
| //  This source code is licensed under both the GPLv2 (found in the
 | ||||
| //  COPYING file in the root directory) and Apache 2.0 License
 | ||||
| //  (found in the LICENSE.Apache file in the root directory).
 | ||||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <algorithm> | ||||
| #include <cinttypes> | ||||
| #include <deque> | ||||
| #include <functional> | ||||
| #include <iterator> | ||||
| #include <string> | ||||
| 
 | ||||
| #include "rocksdb/status.h" | ||||
| #include "rocksdb/types.h" | ||||
| 
 | ||||
| namespace ROCKSDB_NAMESPACE { | ||||
| 
 | ||||
| constexpr uint64_t kUnknownSeqnoTime = 0; | ||||
| 
 | ||||
| // SeqnoToTimeMapping stores the sequence number to time mapping, so given a
 | ||||
| // sequence number it can estimate the oldest possible time for that sequence
 | ||||
| // number. For example:
 | ||||
| //   10 -> 100
 | ||||
| //   50 -> 300
 | ||||
| // then if a key has seqno 19, the OldestApproximateTime would be 100, for 51 it
 | ||||
| // would be 300.
 | ||||
| // As it's a sorted list, the new entry is inserted from the back. The old data
 | ||||
| // will be popped from the front if they're no longer used.
 | ||||
| class SeqnoToTimeMapping { | ||||
|  public: | ||||
|   // Maximum number of entries can be encoded into SST. The data is delta encode
 | ||||
|   // so the maximum data usage for each SST is < 0.3K
 | ||||
|   static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100; | ||||
| 
 | ||||
|   // Maximum number of entries per CF. If there's only CF with this feature on,
 | ||||
|   // the max duration divided by this number, so for example, if
 | ||||
|   // preclude_last_level_data_seconds = 100000 (~1day), then it will sample the
 | ||||
|   // seqno -> time every 1000 seconds (~17minutes). Then the maximum entry it
 | ||||
|   // needs is 100.
 | ||||
|   // When there are multiple CFs having this feature on, the sampling cadence is
 | ||||
|   // determined by the smallest setting, the capacity is determined the largest
 | ||||
|   // setting, also it's caped by kMaxSeqnoTimePairsPerCF * 10.
 | ||||
|   static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100; | ||||
| 
 | ||||
|   // A simple struct for sequence number to time pair
 | ||||
|   struct SeqnoTimePair { | ||||
|     SequenceNumber seqno = 0; | ||||
|     uint64_t time = 0; | ||||
| 
 | ||||
|     SeqnoTimePair() = default; | ||||
|     SeqnoTimePair(SequenceNumber _seqno, uint64_t _time) | ||||
|         : seqno(_seqno), time(_time) {} | ||||
| 
 | ||||
|     // Encode to dest string
 | ||||
|     void Encode(std::string& dest) const; | ||||
| 
 | ||||
|     // Decode the value from input Slice and remove it from the input
 | ||||
|     Status Decode(Slice& input); | ||||
| 
 | ||||
|     // subtraction of 2 SeqnoTimePair
 | ||||
|     SeqnoTimePair operator-(const SeqnoTimePair& other) const; | ||||
| 
 | ||||
|     // Add 2 values together
 | ||||
|     void Add(const SeqnoTimePair& obj) { | ||||
|       seqno += obj.seqno; | ||||
|       time += obj.time; | ||||
|     } | ||||
| 
 | ||||
|     // Compare SeqnoTimePair with a sequence number, used for binary search a
 | ||||
|     // sequence number in a list of SeqnoTimePair
 | ||||
|     bool operator<(const SequenceNumber& other) const { return seqno < other; } | ||||
| 
 | ||||
|     // Compare 2 SeqnoTimePair
 | ||||
|     bool operator<(const SeqnoTimePair& other) const { | ||||
|       return std::tie(seqno, time) < std::tie(other.seqno, other.time); | ||||
|     } | ||||
| 
 | ||||
|     // Check if 2 SeqnoTimePair is the same
 | ||||
|     bool operator==(const SeqnoTimePair& other) const { | ||||
|       return std::tie(seqno, time) == std::tie(other.seqno, other.time); | ||||
|     } | ||||
|   }; | ||||
| 
 | ||||
|   // constractor of SeqnoToTimeMapping
 | ||||
|   // max_time_duration is the maximum time it should track. For example, if
 | ||||
|   // preclude_last_level_data_seconds is 1 day, then if an entry is older than 1
 | ||||
|   // day, then it can be removed.
 | ||||
|   // max_capacity is the maximum number of entry it can hold. For single CF,
 | ||||
|   // it's caped at 100 (kMaxSeqnoTimePairsPerCF), otherwise
 | ||||
|   // kMaxSeqnoTimePairsPerCF * 10.
 | ||||
|   // If it's set to 0, means it won't truncate any old data.
 | ||||
|   explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0, | ||||
|                               uint64_t max_capacity = 0) | ||||
|       : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {} | ||||
| 
 | ||||
|   // Append a new entry to the list. The new entry should be newer than the
 | ||||
|   // existing ones. It maintains the internal sorted status.
 | ||||
|   bool Append(SequenceNumber seqno, uint64_t time); | ||||
| 
 | ||||
|   // Given a sequence number, estimate it's oldest time
 | ||||
|   uint64_t GetOldestApproximateTime(SequenceNumber seqno) const; | ||||
| 
 | ||||
|   // Truncate the old entries based on the current time and max_time_duration_
 | ||||
|   SequenceNumber TruncateOldEntries(uint64_t now); | ||||
| 
 | ||||
|   // Encode to a binary string
 | ||||
|   void Encode(std::string& des, SequenceNumber start, SequenceNumber end, | ||||
|               uint64_t now, | ||||
|               uint64_t output_size = kMaxSeqnoTimePairsPerSST) const; | ||||
| 
 | ||||
|   // Add a new random entry, unlike Append(), it can be any data, but also makes
 | ||||
|   // the list un-sorted.
 | ||||
|   void Add(SequenceNumber seqno, uint64_t time); | ||||
| 
 | ||||
|   // Decode and add the entries to the current obj. The list will be unsorted
 | ||||
|   Status Add(const std::string& seqno_time_mapping_str); | ||||
| 
 | ||||
|   // Return the number of entries
 | ||||
|   size_t Size() const { return seqno_time_mapping_.size(); } | ||||
| 
 | ||||
|   // Reduce the size of internal list
 | ||||
|   bool Resize(uint64_t min_time_duration, uint64_t max_time_duration); | ||||
| 
 | ||||
|   // Override the max_time_duration_
 | ||||
|   void SetMaxTimeDuration(uint64_t max_time_duration) { | ||||
|     max_time_duration_ = max_time_duration; | ||||
|   } | ||||
| 
 | ||||
|   uint64_t GetCapacity() const { return max_capacity_; } | ||||
| 
 | ||||
|   // Sort the list, which also remove the redundant entries, useless entries,
 | ||||
|   // which makes sure the seqno is sorted, but also the time
 | ||||
|   Status Sort(); | ||||
| 
 | ||||
|   // copy the current obj from the given smallest_seqno.
 | ||||
|   SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const; | ||||
| 
 | ||||
|   // If the internal list is empty
 | ||||
|   bool Empty() const { return seqno_time_mapping_.empty(); } | ||||
| 
 | ||||
|   // clear all entries
 | ||||
|   void Clear() { seqno_time_mapping_.clear(); } | ||||
| 
 | ||||
|   // return the string for user message
 | ||||
|   // Note: Not efficient, okay for print
 | ||||
|   std::string ToHumanString() const; | ||||
| 
 | ||||
| #ifndef NDEBUG | ||||
|   const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const { | ||||
|     return seqno_time_mapping_; | ||||
|   } | ||||
| #endif | ||||
| 
 | ||||
|  private: | ||||
|   static constexpr uint64_t kMaxSeqnoToTimeEntries = | ||||
|       kMaxSeqnoTimePairsPerCF * 10; | ||||
| 
 | ||||
|   uint64_t max_time_duration_; | ||||
|   uint64_t max_capacity_; | ||||
| 
 | ||||
|   std::deque<SeqnoTimePair> seqno_time_mapping_; | ||||
| 
 | ||||
|   bool is_sorted_ = true; | ||||
| 
 | ||||
|   static uint64_t CalculateMaxCapacity(uint64_t min_time_duration, | ||||
|                                        uint64_t max_time_duration); | ||||
| 
 | ||||
|   SeqnoTimePair& Last() { | ||||
|     assert(!Empty()); | ||||
|     return seqno_time_mapping_.back(); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // for searching the sequence number from SeqnoToTimeMapping
 | ||||
| inline bool operator<(const SequenceNumber& seqno, | ||||
|                       const SeqnoToTimeMapping::SeqnoTimePair& other) { | ||||
|   return seqno < other.seqno; | ||||
| } | ||||
| 
 | ||||
| }  // namespace ROCKSDB_NAMESPACE
 | ||||
					Loading…
					
					
				
		Reference in new issue
	
	 Jay Zhuang
						Jay Zhuang