diff --git a/HISTORY.md b/HISTORY.md index 4d6b089f1..516eb673b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,9 +3,11 @@ ### Public API Change * For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed. * With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they first inserted into the cache. This is to make cache entries never get hit age out faster, improving cache efficiency when large background scan presents. +* The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries. ### New Features * Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used. +* Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table. * Improve the performance of iterators doing long range scans by using readahead, when using direct IO. * pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false. diff --git a/db/builder.cc b/db/builder.cc index cb45bb093..a6d87358f 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -161,7 +161,8 @@ Status BuildTable( nullptr /* upper_bound */, meta); // Finish and check for builder errors - bool empty = builder->NumEntries() == 0; + tp = builder->GetTableProperties(); + bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0; s = c_iter.status(); if (!s.ok() || empty) { builder->Abandon(); @@ -174,7 +175,7 @@ Status BuildTable( meta->fd.file_size = file_size; meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); - tp = builder->GetTableProperties(); + tp = builder->GetTableProperties(); // refresh now that builder is finished if (table_properties) { *table_properties = tp; } diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 12aa1d41b..09496637c 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1217,7 +1217,12 @@ Status CompactionJob::FinishCompactionOutputFile( } sub_compact->outfile.reset(); - if (s.ok() && current_entries == 0) { + TableProperties tp; + if (s.ok()) { + tp = sub_compact->builder->GetTableProperties(); + } + + if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) { // If there is nothing to output, no necessary to generate a sst file. // This happens when the output level is bottom level, at the same time // the sub_compact output nothing. @@ -1236,10 +1241,8 @@ Status CompactionJob::FinishCompactionOutputFile( } ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - TableProperties tp; - if (s.ok() && current_entries > 0) { + if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { // Output to event logger and fire events. - tp = sub_compact->builder->GetTableProperties(); sub_compact->current_output()->table_properties = std::make_shared(tp); ROCKS_LOG_INFO(db_options_.info_log, diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index b38fe0352..819758e3f 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -170,6 +170,7 @@ void ResetTableProperties(TableProperties* tp) { tp->raw_value_size = 0; tp->num_data_blocks = 0; tp->num_entries = 0; + tp->num_range_deletions = 0; } void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { @@ -178,15 +179,18 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { std::replace(tp_string.begin(), tp_string.end(), '=', ' '); ResetTableProperties(tp); sscanf(tp_string.c_str(), - "# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64 + "# data blocks %" SCNu64 " # entries %" SCNu64 + " # range deletions %" SCNu64 + " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " " data block size %" SCNu64 " index block size (user-key? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, - &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_key_is_user_key, &tp->index_size, &tp->filter_size); + &tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions, + &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, + &tp->data_size, &tp->index_key_is_user_key, &tp->index_size, + &tp->filter_size); } void VerifySimilar(uint64_t a, uint64_t b, double bias) { @@ -217,20 +221,25 @@ void VerifyTableProperties(const TableProperties& base_tp, ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); + ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); } void GetExpectedTableProperties(TableProperties* expected_tp, const int kKeySize, const int kValueSize, - const int kKeysPerTable, const int kTableCount, + const int kKeysPerTable, + const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, const bool index_key_is_user_key) { const int kKeyCount = kTableCount * kKeysPerTable; + const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; const int kAvgSuccessorSize = kKeySize / 5; const int kEncodingSavePerKey = kKeySize / 4; - expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); - expected_tp->raw_value_size = kKeyCount * kValueSize; + expected_tp->raw_key_size = (kKeyCount + kRangeDeletionCount) * (kKeySize + 8); + expected_tp->raw_value_size = (kKeyCount + kRangeDeletionCount) * kValueSize; expected_tp->num_entries = kKeyCount; + expected_tp->num_range_deletions = kRangeDeletionCount; expected_tp->num_data_blocks = kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / @@ -291,6 +300,7 @@ TEST_F(DBPropertiesTest, ValidateSampleNumber) { TEST_F(DBPropertiesTest, AggregatedTableProperties) { for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) { + const int kRangeDeletionsPerTable = 5; const int kKeysPerTable = 100; const int kKeySize = 80; const int kValueSize = 200; @@ -309,12 +319,22 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { DestroyAndReopen(options); + // Hold open a snapshot to prevent range tombstones from being compacted + // away. + ManagedSnapshot snapshot(db_); + Random rnd(5632); for (int table = 1; table <= kTableCount; ++table) { for (int i = 0; i < kKeysPerTable; ++i) { db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); } + for (int i = 0; i < kRangeDeletionsPerTable; i++) { + std::string start = RandomString(&rnd, kKeySize); + std::string end = start; + end.resize(kValueSize); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + } db_->Flush(FlushOptions()); } std::string property; @@ -325,7 +345,8 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { TableProperties expected_tp; GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, kTableCount, kBloomBitsPerKey, + kKeysPerTable, kRangeDeletionsPerTable, + kTableCount, kBloomBitsPerKey, table_options.block_size, index_key_is_user_key); VerifyTableProperties(expected_tp, output_tp); @@ -448,6 +469,7 @@ TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) { TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { const int kTableCount = 100; + const int kRangeDeletionsPerTable = 2; const int kKeysPerTable = 10; const int kKeySize = 50; const int kValueSize = 400; @@ -473,6 +495,9 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { DestroyAndReopen(options); + // Hold open a snapshot to prevent range tombstones from being compacted away. + ManagedSnapshot snapshot(db_); + std::string level_tp_strings[kMaxLevel]; std::string tp_string; TableProperties level_tps[kMaxLevel]; @@ -482,6 +507,12 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); } + for (int i = 0; i < kRangeDeletionsPerTable; i++) { + std::string start = RandomString(&rnd, kKeySize); + std::string end = start; + end.resize(kValueSize); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + } db_->Flush(FlushOptions()); db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); ResetTableProperties(&sum_tp); @@ -497,6 +528,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { sum_tp.raw_value_size += level_tps[level].raw_value_size; sum_tp.num_data_blocks += level_tps[level].num_data_blocks; sum_tp.num_entries += level_tps[level].num_entries; + sum_tp.num_range_deletions += level_tps[level].num_range_deletions; } db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); @@ -508,13 +540,15 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); ASSERT_EQ(sum_tp.num_entries, tp.num_entries); + ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); if (table > 3) { GetExpectedTableProperties( - &expected_tp, kKeySize, kValueSize, kKeysPerTable, table, - kBloomBitsPerKey, table_options.block_size, index_key_is_user_key); + &expected_tp, kKeySize, kValueSize, kKeysPerTable, + kRangeDeletionsPerTable, table, kBloomBitsPerKey, + table_options.block_size, index_key_is_user_key); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. - VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); + VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); } } } diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 18165922a..32ddb6c98 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -39,6 +39,7 @@ struct TablePropertiesNames { static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kNumRangeDeletions; static const std::string kFormatVersion; static const std::string kFixedKeyLen; static const std::string kFilterPolicy; @@ -148,6 +149,8 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of range deletions in this table + uint64_t num_range_deletions = 0; // format version, reserved for backward compatibility uint64_t format_version = 0; // If 0, key is variable length. Otherwise number of bytes for each key. diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 030e2fb2f..7d7faf60d 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -444,9 +444,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->ioptions.info_log); } else if (value_type == kTypeRangeDeletion) { - // TODO(wanning&andrewkr) add num_tomestone to table properties r->range_del_block.Add(key, value); - ++r->props.num_entries; + ++r->props.num_range_deletions; r->props.raw_key_size += key.size(); r->props.raw_value_size += value.size(); NotifyCollectTableCollectorsOnAdd(key, value, r->offset, diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index ce508ce1f..d6076affa 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -77,6 +77,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { } Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); Add(TablePropertiesNames::kFilterSize, props.filter_size); Add(TablePropertiesNames::kFormatVersion, props.format_version); @@ -224,6 +225,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumRangeDeletions, + &new_table_properties->num_range_deletions}, {TablePropertiesNames::kFormatVersion, &new_table_properties->format_version}, {TablePropertiesNames::kFixedKeyLen, diff --git a/table/table_properties.cc b/table/table_properties.cc index 4d75abdb3..9c1c4bd8e 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -78,6 +78,8 @@ std::string TableProperties::ToString( AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, kv_delim); AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + AppendProperty(result, "# range deletions", num_range_deletions, prop_delim, + kv_delim); AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); AppendProperty(result, "raw average key size", @@ -166,6 +168,7 @@ void TableProperties::Add(const TableProperties& tp) { raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_range_deletions += tp.num_range_deletions; } const std::string TablePropertiesNames::kDataSize = @@ -188,6 +191,8 @@ const std::string TablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumRangeDeletions = + "rocksdb.num.range-deletions"; const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; const std::string TablePropertiesNames::kFormatVersion =