Tiered compaction: integrate Seqno time mapping with per key placement (#10370)

Summary:
Using the Sequence number to time mapping to decide if a key is hot or not in
compaction and place it in the corresponding level.

Note: the feature is not complete, level compaction will run indefinitely until
all penultimate level data is cold and small enough to not trigger compaction.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10370

Test Plan:
CI
* Run basic db_bench for universal compaction manually

Reviewed By: siying

Differential Revision: D37892338

Pulled By: jay-zhuang

fbshipit-source-id: 792bbd91b1ccc2f62b5d14c53118434bcaac4bbe
main
Jay Zhuang 3 years ago committed by Facebook GitHub Bot
parent 7506c1a4ca
commit faa0f9723c
  1. 13
      db/compaction/compaction.cc
  2. 20
      db/compaction/compaction_iterator.cc
  3. 11
      db/compaction/compaction_iterator.h
  4. 7
      db/compaction/compaction_job.cc
  5. 9
      db/compaction/compaction_job.h
  6. 30
      db/compaction/tiered_compaction_test.cc
  7. 11
      db/db_compaction_test.cc
  8. 2
      db/db_impl/db_impl.h
  9. 12
      db/db_test2.cc
  10. 9
      db/db_test_util.cc
  11. 2
      db/db_test_util.h
  12. 15
      db/event_helpers.cc
  13. 10
      db/external_sst_file_basic_test.cc
  14. 2
      db/flush_job.cc
  15. 246
      db/seqno_time_test.cc
  16. 8
      db/seqno_to_time_mapping.cc
  17. 3
      db/seqno_to_time_mapping.h

@ -440,6 +440,11 @@ bool Compaction::IsTrivialMove() const {
} }
} }
// PerKeyPlacement compaction should never be trivial move.
if (SupportsPerKeyPlacement()) {
return false;
}
return true; return true;
} }
@ -741,10 +746,10 @@ int Compaction::EvaluatePenultimateLevel(
return kInvalidLevel; return kInvalidLevel;
} }
// TODO: will add public like `options.preclude_last_level_data_seconds` for bool supports_per_key_placement =
// per_key_placement feature, will check that option here. Currently, only immutable_options.preclude_last_level_data_seconds > 0;
// set by unittest
bool supports_per_key_placement = false; // it could be overridden by unittest
TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled", TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
&supports_per_key_placement); &supports_per_key_placement);
if (!supports_per_key_placement) { if (!supports_per_key_placement) {

@ -34,7 +34,7 @@ CompactionIterator::CompactionIterator(
const std::atomic<bool>* shutting_down, const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log, const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low, const std::string* full_history_ts_low,
const SequenceNumber max_seqno_allow_zero_out) const SequenceNumber penultimate_level_cutoff_seqno)
: CompactionIterator( : CompactionIterator(
input, cmp, merge_helper, last_sequence, snapshots, input, cmp, merge_helper, last_sequence, snapshots,
earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env, earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
@ -44,7 +44,7 @@ CompactionIterator::CompactionIterator(
std::unique_ptr<CompactionProxy>( std::unique_ptr<CompactionProxy>(
compaction ? new RealCompaction(compaction) : nullptr), compaction ? new RealCompaction(compaction) : nullptr),
compaction_filter, shutting_down, info_log, full_history_ts_low, compaction_filter, shutting_down, info_log, full_history_ts_low,
max_seqno_allow_zero_out) {} penultimate_level_cutoff_seqno) {}
CompactionIterator::CompactionIterator( CompactionIterator::CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@ -61,7 +61,7 @@ CompactionIterator::CompactionIterator(
const std::atomic<bool>* shutting_down, const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log, const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low, const std::string* full_history_ts_low,
const SequenceNumber max_seqno_allow_zero_out) const SequenceNumber penultimate_level_cutoff_seqno)
: input_(input, cmp, : input_(input, cmp,
!compaction || compaction->DoesInputReferenceBlobFiles()), !compaction || compaction->DoesInputReferenceBlobFiles()),
cmp_(cmp), cmp_(cmp),
@ -96,7 +96,7 @@ CompactionIterator::CompactionIterator(
current_key_committed_(false), current_key_committed_(false),
cmp_with_history_ts_low_(0), cmp_with_history_ts_low_(0),
level_(compaction_ == nullptr ? 0 : compaction_->level()), level_(compaction_ == nullptr ? 0 : compaction_->level()),
max_seqno_allow_zero_out_(max_seqno_allow_zero_out) { penultimate_level_cutoff_seqno_(penultimate_level_cutoff_seqno) {
assert(snapshots_ != nullptr); assert(snapshots_ != nullptr);
bottommost_level_ = compaction_ == nullptr bottommost_level_ = compaction_ == nullptr
? false ? false
@ -1081,8 +1081,7 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
void CompactionIterator::DecideOutputLevel() { void CompactionIterator::DecideOutputLevel() {
#ifndef NDEBUG #ifndef NDEBUG
// TODO: will be set by sequence number or key range, for now, it will only be // Could be overridden by unittest
// set by unittest
PerKeyPlacementContext context(level_, ikey_.user_key, value_, PerKeyPlacementContext context(level_, ikey_.user_key, value_,
ikey_.sequence); ikey_.sequence);
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
@ -1090,9 +1089,10 @@ void CompactionIterator::DecideOutputLevel() {
output_to_penultimate_level_ = context.output_to_penultimate_level; output_to_penultimate_level_ = context.output_to_penultimate_level;
#endif /* !NDEBUG */ #endif /* !NDEBUG */
// if the key is within the earliest snapshot, it has to output to the // if the key is newer than the cutoff sequence or within the earliest
// penultimate level. // snapshot, it should output to the penultimate level.
if (ikey_.sequence > earliest_snapshot_) { if (ikey_.sequence > penultimate_level_cutoff_seqno_ ||
ikey_.sequence > earliest_snapshot_) {
output_to_penultimate_level_ = true; output_to_penultimate_level_ = true;
} }
@ -1153,7 +1153,7 @@ void CompactionIterator::PrepareOutput() {
DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
ikey_.type != kTypeMerge && current_key_committed_ && ikey_.type != kTypeMerge && current_key_committed_ &&
!output_to_penultimate_level_ && !output_to_penultimate_level_ &&
ikey_.sequence < max_seqno_allow_zero_out_) { ikey_.sequence < penultimate_level_cutoff_seqno_) {
if (ikey_.type == kTypeDeletion || if (ikey_.type == kTypeDeletion ||
(ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) { (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
ROCKS_LOG_FATAL( ROCKS_LOG_FATAL(

@ -196,7 +196,7 @@ class CompactionIterator {
const std::atomic<bool>* shutting_down = nullptr, const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr, const std::shared_ptr<Logger> info_log = nullptr,
const std::string* full_history_ts_low = nullptr, const std::string* full_history_ts_low = nullptr,
const SequenceNumber max_seqno_allow_zero_out = kMaxSequenceNumber); const SequenceNumber penultimate_level_cutoff_seqno = kMaxSequenceNumber);
// Constructor with custom CompactionProxy, used for tests. // Constructor with custom CompactionProxy, used for tests.
CompactionIterator( CompactionIterator(
@ -214,7 +214,7 @@ class CompactionIterator {
const std::atomic<bool>* shutting_down = nullptr, const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr, const std::shared_ptr<Logger> info_log = nullptr,
const std::string* full_history_ts_low = nullptr, const std::string* full_history_ts_low = nullptr,
const SequenceNumber max_seqno_allow_zero_out = kMaxSequenceNumber); const SequenceNumber penultimate_level_cutoff_seqno = kMaxSequenceNumber);
~CompactionIterator(); ~CompactionIterator();
@ -444,10 +444,9 @@ class CompactionIterator {
// output to. // output to.
bool output_to_penultimate_level_{false}; bool output_to_penultimate_level_{false};
// any key later than this sequence number, need to keep the sequence number // any key later than this sequence number should have
// and not zeroed out. The sequence number is kept to track it's approximate // output_to_penultimate_level_ set to true
// time. const SequenceNumber penultimate_level_cutoff_seqno_ = kMaxSequenceNumber;
const SequenceNumber max_seqno_allow_zero_out_ = kMaxSequenceNumber;
void AdvanceInputIter() { input_.Next(); } void AdvanceInputIter() { input_.Next(); }

@ -282,9 +282,9 @@ void CompactionJob::Prepare() {
ROCKS_LOG_WARN(db_options_.info_log, ROCKS_LOG_WARN(db_options_.info_log,
"Failed to get current time in compaction: Status: %s", "Failed to get current time in compaction: Status: %s",
status.ToString().c_str()); status.ToString().c_str());
max_seqno_allow_zero_out_ = 0; penultimate_level_cutoff_seqno_ = 0;
} else { } else {
max_seqno_allow_zero_out_ = penultimate_level_cutoff_seqno_ =
seqno_time_mapping_.TruncateOldEntries(_current_time); seqno_time_mapping_.TruncateOldEntries(_current_time);
} }
} }
@ -1026,7 +1026,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
blob_file_builder.get(), db_options_.allow_data_in_errors, blob_file_builder.get(), db_options_.allow_data_in_errors,
db_options_.enforce_single_del_contracts, manual_compaction_canceled_, db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
sub_compact->compaction, compaction_filter, shutting_down_, sub_compact->compaction, compaction_filter, shutting_down_,
db_options_.info_log, full_history_ts_low, max_seqno_allow_zero_out_); db_options_.info_log, full_history_ts_low,
penultimate_level_cutoff_seqno_);
c_iter->SeekToFirst(); c_iter->SeekToFirst();
// Assign range delete aggregator to the target output level, which makes sure // Assign range delete aggregator to the target output level, which makes sure

@ -304,9 +304,12 @@ class CompactionJob {
// it also collects the smallest_seqno -> oldest_ancester_time from the SST. // it also collects the smallest_seqno -> oldest_ancester_time from the SST.
SeqnoToTimeMapping seqno_time_mapping_; SeqnoToTimeMapping seqno_time_mapping_;
// If a sequence number larger than max_seqno_allow_zero_out_, it won't be // cutoff sequence number for penultimate level, only set when
// zeroed out. The sequence number is kept to get approximate time of the key. // per_key_placement feature is enabled.
SequenceNumber max_seqno_allow_zero_out_ = kMaxSequenceNumber; // If a key with sequence number larger than penultimate_level_cutoff_seqno_,
// it will be placed on the penultimate_level and seqnuence number won't be
// zeroed out.
SequenceNumber penultimate_level_cutoff_seqno_ = kMaxSequenceNumber;
// Get table file name in where it's outputting to, which should also be in // Get table file name in where it's outputting to, which should also be in
// `output_directory_`. // `output_directory_`.

@ -53,26 +53,17 @@ class TieredCompactionTest : public DBTestBase {
InternalStats::CompactionOutputsStats kBasicPerLevelStats; InternalStats::CompactionOutputsStats kBasicPerLevelStats;
InternalStats::CompactionStats kBasicFlushStats; InternalStats::CompactionStats kBasicFlushStats;
std::atomic_bool enable_per_key_placement = true;
void SetUp() override { void SetUp() override {
SyncPoint::GetInstance()->SetCallBack( SyncPoint::GetInstance()->SetCallBack(
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
auto supports_per_key_placement = static_cast<bool*>(arg); auto supports_per_key_placement = static_cast<bool*>(arg);
*supports_per_key_placement = true; *supports_per_key_placement = enable_per_key_placement;
}); });
SyncPoint::GetInstance()->EnableProcessing(); SyncPoint::GetInstance()->EnableProcessing();
} }
#ifndef ROCKSDB_LITE
uint64_t GetSstSizeHelper(Temperature temperature) {
std::string prop;
EXPECT_TRUE(dbfull()->GetProperty(
DB::Properties::kLiveSstFilesSizeAtTemperature +
std::to_string(static_cast<uint8_t>(temperature)),
&prop));
return static_cast<uint64_t>(std::atoi(prop.c_str()));
}
#endif // ROCKSDB_LITE
const std::vector<InternalStats::CompactionStats>& GetCompactionStats() { const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
VersionSet* const versions = dbfull()->GetVersionSet(); VersionSet* const versions = dbfull()->GetVersionSet();
assert(versions); assert(versions);
@ -1054,12 +1045,14 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
latest_cold_seq = seq_history[2];
MoveFilesToLevel(kLastLevel); MoveFilesToLevel(kLastLevel);
// move forward the cold_seq again with range delete, take a snapshot to keep // move forward the cold_seq again with range delete, take a snapshot to keep
// the range dels in bottommost // the range dels in bottommost
auto snap = db_->GetSnapshot(); auto snap = db_->GetSnapshot();
latest_cold_seq = seq_history[2];
std::string start = Key(25), end = Key(35); std::string start = Key(25), end = Key(35);
ASSERT_OK( ASSERT_OK(
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
@ -1104,9 +1097,12 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
db_->ReleaseSnapshot(snap); db_->ReleaseSnapshot(snap);
// TODO: it should push the data to last level, but penultimate level file is
// already bottommost, it's a conflict between bottommost_temperature and
// tiered compaction which only applies to last level compaction.
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
// 3 range dels dropped, the first one is double counted as expected, which is // 3 range dels dropped, the first one is double counted as expected, which is
@ -1123,8 +1119,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
// input range // input range
latest_cold_seq = seq_history[1]; latest_cold_seq = seq_history[1];
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
} }

@ -78,17 +78,6 @@ class DBCompactionTest : public DBTestBase {
: DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {} : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
protected: protected:
#ifndef ROCKSDB_LITE
uint64_t GetSstSizeHelper(Temperature temperature) {
std::string prop;
EXPECT_TRUE(dbfull()->GetProperty(
DB::Properties::kLiveSstFilesSizeAtTemperature +
std::to_string(static_cast<uint8_t>(temperature)),
&prop));
return static_cast<uint64_t>(std::atoi(prop.c_str()));
}
#endif // ROCKSDB_LITE
/* /*
* Verifies compaction stats of cfd are valid. * Verifies compaction stats of cfd are valid.
* *

@ -2594,6 +2594,8 @@ class DBImpl : public DB {
// Pointer to WriteBufferManager stalling interface. // Pointer to WriteBufferManager stalling interface.
std::unique_ptr<StallInterface> wbm_stall_; std::unique_ptr<StallInterface> wbm_stall_;
// seqno_time_mapping_ stores the sequence number to time mapping, it's not
// thread safe, both read and write need db mutex hold.
SeqnoToTimeMapping seqno_time_mapping_; SeqnoToTimeMapping seqno_time_mapping_;
}; };

@ -33,18 +33,6 @@ namespace ROCKSDB_NAMESPACE {
class DBTest2 : public DBTestBase { class DBTest2 : public DBTestBase {
public: public:
DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {} DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
protected:
#ifndef ROCKSDB_LITE
uint64_t GetSstSizeHelper(Temperature temperature) {
std::string prop;
EXPECT_TRUE(dbfull()->GetProperty(
DB::Properties::kLiveSstFilesSizeAtTemperature +
std::to_string(static_cast<uint8_t>(temperature)),
&prop));
return static_cast<uint64_t>(std::atoi(prop.c_str()));
}
#endif // ROCKSDB_LITE
}; };
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE

@ -1676,6 +1676,15 @@ uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
} }
return result; return result;
} }
uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) {
std::string prop;
EXPECT_TRUE(dbfull()->GetProperty(
DB::Properties::kLiveSstFilesSizeAtTemperature +
std::to_string(static_cast<uint8_t>(temperature)),
&prop));
return static_cast<uint64_t>(std::atoi(prop.c_str()));
}
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
void VerifySstUniqueIds(const TablePropertiesCollection& props) { void VerifySstUniqueIds(const TablePropertiesCollection& props) {

@ -1345,6 +1345,8 @@ class DBTestBase : public testing::Test {
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
std::string column_family_name); std::string column_family_name);
uint64_t GetSstSizeHelper(Temperature temperature);
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {

@ -148,8 +148,19 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
<< table_properties.fast_compression_estimated_data_size << table_properties.fast_compression_estimated_data_size
<< "db_id" << table_properties.db_id << "db_session_id" << "db_id" << table_properties.db_id << "db_session_id"
<< table_properties.db_session_id << "orig_file_number" << table_properties.db_session_id << "orig_file_number"
<< table_properties.orig_file_number << "seqno_to_time_mapping" << table_properties.orig_file_number << "seqno_to_time_mapping";
<< table_properties.seqno_to_time_mapping;
if (table_properties.seqno_to_time_mapping.empty()) {
jwriter << "N/A";
} else {
SeqnoToTimeMapping tmp;
Status status = tmp.Add(table_properties.seqno_to_time_mapping);
if (status.ok()) {
jwriter << tmp.ToHumanString();
} else {
jwriter << "Invalid";
}
}
// user collected properties // user collected properties
for (const auto& prop : table_properties.readable_properties) { for (const auto& prop : table_properties.readable_properties) {

@ -187,16 +187,6 @@ class ExternalSSTFileBasicTest
std::string sst_files_dir_; std::string sst_files_dir_;
std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_; std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
bool random_rwfile_supported_; bool random_rwfile_supported_;
#ifndef ROCKSDB_LITE
uint64_t GetSstSizeHelper(Temperature temperature) {
std::string prop;
EXPECT_TRUE(dbfull()->GetProperty(
DB::Properties::kLiveSstFilesSizeAtTemperature +
std::to_string(static_cast<uint8_t>(temperature)),
&prop));
return static_cast<uint64_t>(std::atoi(prop.c_str()));
}
#endif // ROCKSDB_LITE
}; };
TEST_F(ExternalSSTFileBasicTest, Basic) { TEST_F(ExternalSSTFileBasicTest, Basic) {

@ -818,6 +818,8 @@ Status FlushJob::WriteLevel0Table() {
SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber(); SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber();
if (!db_impl_seqno_time_mapping_.Empty()) { if (!db_impl_seqno_time_mapping_.Empty()) {
// make a local copy, as the seqno_time_mapping from db_impl is not thread
// safe, which will be used while not holding the db_mutex.
seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno); seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno);
} }

@ -8,6 +8,7 @@
#include "db/periodic_work_scheduler.h" #include "db/periodic_work_scheduler.h"
#include "db/seqno_to_time_mapping.h" #include "db/seqno_to_time_mapping.h"
#include "port/stack_trace.h" #include "port/stack_trace.h"
#include "rocksdb/iostats_context.h"
#include "test_util/mock_time_env.h" #include "test_util/mock_time_env.h"
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
@ -35,8 +36,253 @@ class SeqnoTimeTest : public DBTestBase {
PeriodicWorkTestScheduler::Default(mock_clock_); PeriodicWorkTestScheduler::Default(mock_clock_);
}); });
} }
// make sure the file is not in cache, otherwise it won't have IO info
void AssertKetTemperature(int key_id, Temperature expected_temperature) {
get_iostats_context()->Reset();
IOStatsContext* iostats = get_iostats_context();
std::string result = Get(Key(key_id));
ASSERT_FALSE(result.empty());
ASSERT_GT(iostats->bytes_read, 0);
switch (expected_temperature) {
case Temperature::kUnknown:
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
case Temperature::kCold:
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
default:
// the test only support kCold now for the bottommost temperature
FAIL();
}
}
}; };
TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
const int kNumTrigger = 4;
const int kNumLevels = 7;
const int kNumKeys = 100;
const int kKeyPerSec = 10;
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleUniversal;
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
DestroyAndReopen(options);
// pass some time first, otherwise the first a few keys write time are going
// to be zero, and internally zero has special meaning: kUnknownSeqnoTime
dbfull()->TEST_WaitForPeridicWorkerRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
int sst_num = 0;
// Write files that are overlap and enough to trigger compaction
for (; sst_num < kNumTrigger; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeridicWorkerRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->WaitForCompact(true));
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKetTemperature(20, Temperature::kUnknown);
// Write more data, but still all hot until the 10th SST, as:
// write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds
// The preclude_last_level_data_seconds is 10k
for (; sst_num < kNumTrigger * 2; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeridicWorkerRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->WaitForCompact(true));
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
}
// Now we have both hot data and cold data
for (; sst_num < kNumTrigger * 3; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeridicWorkerRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->WaitForCompact(true));
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKetTemperature(20, Temperature::kCold);
// Wait some time, each time after compaction, the cold data size is
// increasing and hot data size is decreasing
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeridicWorkerRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
});
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t pre_hot = hot_data_size;
uint64_t pre_cold = cold_data_size;
hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_LT(hot_data_size, pre_hot);
ASSERT_GT(cold_data_size, pre_cold);
// the hot/cold data cut off range should be between i * 20 + 200 -> 250
AssertKetTemperature(i * 20 + 250, Temperature::kUnknown);
AssertKetTemperature(i * 20 + 200, Temperature::kCold);
}
// Wait again, all data should be cold after that
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeridicWorkerRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
// any random data should be cold
AssertKetTemperature(1000, Temperature::kCold);
// close explicitly, because the env is local variable which will be released
// first.
Close();
}
TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
const int kNumLevels = 7;
const int kNumKeys = 100;
Options options = CurrentOptions();
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
options.level_compaction_dynamic_level_bytes = true;
// TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if
// the penultimate level score > 1, but the hot is not cold enough to compact
// to last level, which will keep triggering compaction.
options.disable_auto_compactions = true;
DestroyAndReopen(options);
// pass some time first, otherwise the first a few keys write time are going
// to be zero, and internally zero has special meaning: kUnknownSeqnoTime
dbfull()->TEST_WaitForPeridicWorkerRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
int sst_num = 0;
// Write files that are overlap
for (; sst_num < 4; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeridicWorkerRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKetTemperature(20, Temperature::kUnknown);
// Adding more data to have mixed hot and cold data
for (; sst_num < 14; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeridicWorkerRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
// TODO(zjay): all data become cold because of level 5 (penultimate level) is
// the bottommost level, which converts the data to cold. PerKeyPlacement is
// for the last level (level 6). Will be fixed by change the
// bottommost_temperature to the last_level_temperature
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
// Compact the files to the last level which should split the hot/cold data
MoveFilesToLevel(6);
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKetTemperature(20, Temperature::kCold);
// Wait some time, each it wait, the cold data is increasing and hot data is
// decreasing
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeridicWorkerRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t pre_hot = hot_data_size;
uint64_t pre_cold = cold_data_size;
hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_LT(hot_data_size, pre_hot);
ASSERT_GT(cold_data_size, pre_cold);
// the hot/cold cut_off key should be around i * 20 + 400 -> 450
AssertKetTemperature(i * 20 + 450, Temperature::kUnknown);
AssertKetTemperature(i * 20 + 400, Temperature::kCold);
}
// Wait again, all data should be cold after that
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeridicWorkerRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
// any random data should be cold
AssertKetTemperature(1000, Temperature::kCold);
Close();
}
TEST_F(SeqnoTimeTest, BasicSeqnoToTimeMapping) { TEST_F(SeqnoTimeTest, BasicSeqnoToTimeMapping) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.preclude_last_level_data_seconds = 10000; options.preclude_last_level_data_seconds = 10000;

@ -40,7 +40,7 @@ SequenceNumber SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) {
const uint64_t cut_off_time = const uint64_t cut_off_time =
now > max_time_duration_ ? now - max_time_duration_ : 0; now > max_time_duration_ ? now - max_time_duration_ : 0;
assert(cut_off_time < now); // no overflow assert(cut_off_time <= now); // no overflow
auto it = std::upper_bound( auto it = std::upper_bound(
seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time, seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time,
@ -238,7 +238,11 @@ bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration,
} }
Status SeqnoToTimeMapping::Sort() { Status SeqnoToTimeMapping::Sort() {
if (is_sorted_ || seqno_time_mapping_.empty()) { if (is_sorted_) {
return Status::OK();
}
if (seqno_time_mapping_.empty()) {
is_sorted_ = true;
return Status::OK(); return Status::OK();
} }

@ -29,6 +29,9 @@ constexpr uint64_t kUnknownSeqnoTime = 0;
// would be 300. // would be 300.
// As it's a sorted list, the new entry is inserted from the back. The old data // As it's a sorted list, the new entry is inserted from the back. The old data
// will be popped from the front if they're no longer used. // will be popped from the front if they're no longer used.
//
// Note: the data struct is not thread safe, both read and write need to be
// synchronized by caller.
class SeqnoToTimeMapping { class SeqnoToTimeMapping {
public: public:
// Maximum number of entries can be encoded into SST. The data is delta encode // Maximum number of entries can be encoded into SST. The data is delta encode

Loading…
Cancel
Save