You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rocksdb/include/rocksdb/perf_context.h

261 lines
11 KiB

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <stdint.h>
#include <map>
#include <string>
#include "rocksdb/perf_level.h"
namespace ROCKSDB_NAMESPACE {
// A thread local context for gathering performance counter efficiently
// and transparently.
// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
// Break down performance counters by level and store per-level perf context in
// PerfContextByLevel
struct PerfContextByLevel {
// # of times bloom filter has avoided file reads, i.e., negatives.
uint64_t bloom_filter_useful = 0;
// # of times bloom FullFilter has not avoided the reads.
uint64_t bloom_filter_full_positive = 0;
// # of times bloom FullFilter has not avoided the reads and data actually
// exist.
uint64_t bloom_filter_full_true_positive = 0;
// total number of user key returned (only include keys that are found, does
// not include keys that are deleted or merged without a final put
uint64_t user_key_return_count = 0;
// total nanos spent on reading data from SST files
uint64_t get_from_table_nanos = 0;
uint64_t block_cache_hit_count = 0; // total number of block cache hits
uint64_t block_cache_miss_count = 0; // total number of block cache misses
void Reset(); // reset all performance counters to zero
};
struct PerfContext {
~PerfContext();
PerfContext() {}
PerfContext(const PerfContext&);
PerfContext& operator=(const PerfContext&);
PerfContext(PerfContext&&) noexcept;
void Reset(); // reset all performance counters to zero
std::string ToString(bool exclude_zero_counters = false) const;
// enable per level perf context and allocate storage for PerfContextByLevel
void EnablePerLevelPerfContext();
// temporarily disable per level perf context by setting the flag to false
void DisablePerLevelPerfContext();
// free the space for PerfContextByLevel, also disable per level perf context
void ClearPerLevelPerfContext();
uint64_t user_key_comparison_count; // total number of user key comparisons
uint64_t block_cache_hit_count; // total number of block cache hits
uint64_t block_read_count; // total number of block reads (with IO)
uint64_t block_read_byte; // total number of bytes from block reads
uint64_t block_read_time; // total nanos spent on block reads
uint64_t block_cache_index_hit_count; // total number of index block hits
uint64_t index_block_read_count; // total number of index block reads
uint64_t block_cache_filter_hit_count; // total number of filter block hits
uint64_t filter_block_read_count; // total number of filter block reads
uint64_t compression_dict_block_read_count; // total number of compression
// dictionary block reads
uint64_t secondary_cache_hit_count; // total number of secondary cache hits
uint64_t block_checksum_time; // total nanos spent on block checksum
uint64_t block_decompress_time; // total nanos spent on block decompression
uint64_t get_read_bytes; // bytes for vals returned by Get
uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet
uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator
uint64_t blob_cache_hit_count; // total number of blob cache hits
uint64_t blob_read_count; // total number of blob reads (with IO)
uint64_t blob_read_byte; // total number of bytes from blob reads
uint64_t blob_read_time; // total nanos spent on blob reads
uint64_t blob_checksum_time; // total nanos spent on blob checksum
uint64_t blob_decompress_time; // total nanos spent on blob decompression
// total number of internal keys skipped over during iteration.
// There are several reasons for it:
// 1. when calling Next(), the iterator is in the position of the previous
// key, so that we'll need to skip it. It means this counter will always
// be incremented in Next().
// 2. when calling Next(), we need to skip internal entries for the previous
// keys that are overwritten.
// 3. when calling Next(), Seek() or SeekToFirst(), after previous key
// before calling Next(), the seek key in Seek() or the beginning for
// SeekToFirst(), there may be one or more deleted keys before the next
// valid key that the operation should place the iterator to. We need
// to skip both of the tombstone and updates hidden by the tombstones. The
// tombstones are not included in this counter, while previous updates
// hidden by the tombstones will be included here.
// 4. symmetric cases for Prev() and SeekToLast()
// internal_recent_skipped_count is not included in this counter.
//
uint64_t internal_key_skipped_count;
// Total number of deletes and single deletes skipped over during iteration
// When calling Next(), Seek() or SeekToFirst(), after previous position
// before calling Next(), the seek key in Seek() or the beginning for
// SeekToFirst(), there may be one or more deleted keys before the next valid
// key. Every deleted key is counted once. We don't recount here if there are
// still older updates invalidated by the tombstones.
//
uint64_t internal_delete_skipped_count;
// How many times iterators skipped over internal keys that are more recent
// than the snapshot that iterator is using.
//
uint64_t internal_recent_skipped_count;
// How many values were fed into merge operator by iterators.
//
uint64_t internal_merge_count;
Skip swaths of range tombstone covered keys in merging iterator (2022 edition) (#10449) Summary: Delete range logic is moved from `DBIter` to `MergingIterator`, and `MergingIterator` will seek to the end of a range deletion if possible instead of scanning through each key and check with `RangeDelAggregator`. With the invariant that a key in level L (consider memtable as the first level, each immutable and L0 as a separate level) has a larger sequence number than all keys in any level >L, a range tombstone `[start, end)` from level L covers all keys in its range in any level >L. This property motivates optimizations in iterator: - in `Seek(target)`, if level L has a range tombstone `[start, end)` that covers `target.UserKey`, then for all levels > L, we can do Seek() on `end` instead of `target` to skip some range tombstone covered keys. - in `Next()/Prev()`, if the current key is covered by a range tombstone `[start, end)` from level L, we can do `Seek` to `end` for all levels > L. This PR implements the above optimizations in `MergingIterator`. As all range tombstone covered keys are now skipped in `MergingIterator`, the range tombstone logic is removed from `DBIter`. The idea in this PR is similar to https://github.com/facebook/rocksdb/issues/7317, but this PR leaves `InternalIterator` interface mostly unchanged. **Credit**: the cascading seek optimization and the sentinel key (discussed below) are inspired by [Pebble](https://github.com/cockroachdb/pebble/blob/master/merging_iter.go) and suggested by ajkr in https://github.com/facebook/rocksdb/issues/7317. The two optimizations are mostly implemented in `SeekImpl()/SeekForPrevImpl()` and `IsNextDeleted()/IsPrevDeleted()` in `merging_iterator.cc`. See comments for each method for more detail. One notable change is that the minHeap/maxHeap used by `MergingIterator` now contains range tombstone end keys besides point key iterators. This helps to reduce the number of key comparisons. For example, for a range tombstone `[start, end)`, a `start` and an `end` `HeapItem` are inserted into the heap. When a `HeapItem` for range tombstone start key is popped from the minHeap, we know this range tombstone becomes "active" in the sense that, before the range tombstone's end key is popped from the minHeap, all the keys popped from this heap is covered by the range tombstone's internal key range `[start, end)`. Another major change, *delete range sentinel key*, is made to `LevelIterator`. Before this PR, when all point keys in an SST file are iterated through in `MergingIterator`, a level iterator would advance to the next SST file in its level. In the case when an SST file has a range tombstone that covers keys beyond the SST file's last point key, advancing to the next SST file would lose this range tombstone. Consequently, `MergingIterator` could return keys that should have been deleted by some range tombstone. We prevent this by pretending that file boundaries in each SST file are sentinel keys. A `LevelIterator` now only advance the file iterator once the sentinel key is processed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10449 Test Plan: - Added many unit tests in db_range_del_test - Stress test: `./db_stress --readpercent=5 --prefixpercent=19 --writepercent=20 -delpercent=10 --iterpercent=44 --delrangepercent=2` - Additional iterator stress test is added to verify against iterators against expected state: https://github.com/facebook/rocksdb/issues/10538. This is based on ajkr's previous attempt https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913. ``` python3 ./tools/db_crashtest.py blackbox --simple --write_buffer_size=524288 --target_file_size_base=524288 --max_bytes_for_level_base=2097152 --compression_type=none --max_background_compactions=8 --value_size_mult=33 --max_key=5000000 --interval=10 --duration=7200 --delrangepercent=3 --delpercent=9 --iterpercent=25 --writepercent=60 --readpercent=3 --prefixpercent=0 --num_iterations=1000 --range_deletion_width=100 --verify_iterator_with_expected_state_one_in=1 ``` - Performance benchmark: I used a similar setup as in the blog [post](http://rocksdb.org/blog/2018/11/21/delete-range.html) that introduced DeleteRange, "a database with 5 million data keys, and 10000 range tombstones (ignoring those dropped during compaction) that were written in regular intervals after 4.5 million data keys were written". As expected, the performance with this PR depends on the range tombstone width. ``` # Setup: TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=fillrandom --writes=4500000 --num=5000000 TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=overwrite --writes=500000 --num=5000000 --use_existing_db=true --writes_per_range_tombstone=50 # Scan entire DB TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=readseq[-X5] --use_existing_db=true --num=5000000 --disable_auto_compactions=true # Short range scan (10 Next()) TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=100000 --seek_nexts=10 --disable_auto_compactions=true # Long range scan(1000 Next()) TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=2500 --seek_nexts=1000 --disable_auto_compactions=true ``` Avg over of 10 runs (some slower tests had fews runs): For the first column (tombstone), 0 means no range tombstone, 100-10000 means width of the 10k range tombstones, and 1 means there is a single range tombstone in the entire DB (width is 1000). The 1 tombstone case is to test regression when there's very few range tombstones in the DB, as no range tombstone is likely to take a different code path than with range tombstones. - Scan entire DB | tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% | | ------------- | ------------- | ------------- | ------------- | | 0 range tombstone |2525600 (± 43564) |2486917 (± 33698) |-1.53% | | 100 |1853835 (± 24736) |2073884 (± 32176) |+11.87% | | 1000 |422415 (± 7466) |1115801 (± 22781) |+164.15% | | 10000 |22384 (± 227) |227919 (± 6647) |+918.22% | | 1 range tombstone |2176540 (± 39050) |2434954 (± 24563) |+11.87% | - Short range scan | tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% | | ------------- | ------------- | ------------- | ------------- | | 0 range tombstone |35398 (± 533) |35338 (± 569) |-0.17% | | 100 |28276 (± 664) |31684 (± 331) |+12.05% | | 1000 |7637 (± 77) |25422 (± 277) |+232.88% | | 10000 |1367 |28667 |+1997.07% | | 1 range tombstone |32618 (± 581) |32748 (± 506) |+0.4% | - Long range scan | tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% | | ------------- | ------------- | ------------- | ------------- | | 0 range tombstone |2262 (± 33) |2353 (± 20) |+4.02% | | 100 |1696 (± 26) |1926 (± 18) |+13.56% | | 1000 |410 (± 6) |1255 (± 29) |+206.1% | | 10000 |25 |414 |+1556.0% | | 1 range tombstone |1957 (± 30) |2185 (± 44) |+11.65% | - Microbench does not show significant regression: https://gist.github.com/cbi42/59f280f85a59b678e7e5d8561e693b61 Reviewed By: ajkr Differential Revision: D38450331 Pulled By: cbi42 fbshipit-source-id: b5ef12e8d8c289ed2e163ccdf277f5039b511fca
3 years ago
// Number of times we reseeked inside a merging iterator, specifically to skip
// after or before a range of keys covered by a range deletion in a newer LSM
// component.
uint64_t internal_range_del_reseek_count;
uint64_t get_snapshot_time; // total nanos spent on getting snapshot
uint64_t get_from_memtable_time; // total nanos spent on querying memtables
uint64_t get_from_memtable_count; // number of mem tables queried
// total nanos spent after Get() finds a key
uint64_t get_post_process_time;
uint64_t get_from_output_files_time; // total nanos reading from output files
// total nanos spent on seeking memtable
uint64_t seek_on_memtable_time;
// number of seeks issued on memtable
// (including SeekForPrev but not SeekToFirst and SeekToLast)
uint64_t seek_on_memtable_count;
// number of Next()s issued on memtable
uint64_t next_on_memtable_count;
// number of Prev()s issued on memtable
uint64_t prev_on_memtable_count;
// total nanos spent on seeking child iters
uint64_t seek_child_seek_time;
// number of seek issued in child iterators
uint64_t seek_child_seek_count;
uint64_t seek_min_heap_time; // total nanos spent on the merge min heap
uint64_t seek_max_heap_time; // total nanos spent on the merge max heap
// total nanos spent on seeking the internal entries
uint64_t seek_internal_seek_time;
// total nanos spent on iterating internal entries to find the next user entry
uint64_t find_next_user_entry_time;
// This group of stats provide a breakdown of time spent by Write().
// May be inaccurate when 2PC, two_write_queues or enable_pipelined_write
// are enabled.
//
// total nanos spent on writing to WAL
uint64_t write_wal_time;
// total nanos spent on writing to mem tables
uint64_t write_memtable_time;
// total nanos spent on delaying or throttling write
uint64_t write_delay_time;
// total nanos spent on switching memtable/wal and scheduling
// flushes/compactions.
uint64_t write_scheduling_flushes_compactions_time;
// total nanos spent on writing a record, excluding the above four things
uint64_t write_pre_and_post_process_time;
// time spent waiting for other threads of the batch group
uint64_t write_thread_wait_nanos;
// time spent on acquiring DB mutex.
uint64_t db_mutex_lock_nanos;
// Time spent on waiting with a condition variable created with DB mutex.
uint64_t db_condition_wait_nanos;
// Time spent on merge operator.
uint64_t merge_operator_time_nanos;
// Time spent on reading index block from block cache or SST file
uint64_t read_index_block_nanos;
// Time spent on reading filter block from block cache or SST file
uint64_t read_filter_block_nanos;
// Time spent on creating data block iterator
uint64_t new_table_block_iter_nanos;
// Time spent on creating a iterator of an SST file.
uint64_t new_table_iterator_nanos;
// Time spent on seeking a key in data/index blocks
uint64_t block_seek_nanos;
// Time spent on finding or creating a table reader
uint64_t find_table_nanos;
// total number of mem table bloom hits
uint64_t bloom_memtable_hit_count;
// total number of mem table bloom misses
uint64_t bloom_memtable_miss_count;
// total number of SST table bloom hits
uint64_t bloom_sst_hit_count;
// total number of SST table bloom misses
uint64_t bloom_sst_miss_count;
// Time spent waiting on key locks in transaction lock manager.
uint64_t key_lock_wait_time;
// number of times acquiring a lock was blocked by another transaction.
uint64_t key_lock_wait_count;
// Total time spent in Env filesystem operations. These are only populated
// when TimedEnv is used.
uint64_t env_new_sequential_file_nanos;
uint64_t env_new_random_access_file_nanos;
uint64_t env_new_writable_file_nanos;
uint64_t env_reuse_writable_file_nanos;
uint64_t env_new_random_rw_file_nanos;
uint64_t env_new_directory_nanos;
uint64_t env_file_exists_nanos;
uint64_t env_get_children_nanos;
uint64_t env_get_children_file_attributes_nanos;
uint64_t env_delete_file_nanos;
uint64_t env_create_dir_nanos;
uint64_t env_create_dir_if_missing_nanos;
uint64_t env_delete_dir_nanos;
uint64_t env_get_file_size_nanos;
uint64_t env_get_file_modification_time_nanos;
uint64_t env_rename_file_nanos;
uint64_t env_link_file_nanos;
uint64_t env_lock_file_nanos;
uint64_t env_unlock_file_nanos;
uint64_t env_new_logger_nanos;
uint64_t get_cpu_nanos;
uint64_t iter_next_cpu_nanos;
uint64_t iter_prev_cpu_nanos;
uint64_t iter_seek_cpu_nanos;
// Time spent in encrypting data. Populated when EncryptedEnv is used.
uint64_t encrypt_data_nanos;
// Time spent in decrypting data. Populated when EncryptedEnv is used.
uint64_t decrypt_data_nanos;
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
3 years ago
uint64_t number_async_seek;
std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
bool per_level_perf_context_enabled = false;
};
// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global,
// non-thread-local PerfContext object will be returned. Attempts to update
// this object will be ignored, and reading from it will also be no-op.
// Otherwise,
// a) if thread-local is supported on the platform, then a pointer to
// a thread-local PerfContext object will be returned.
// b) if thread-local is NOT supported, then compilation will fail.
//
// This function never returns nullptr.
PerfContext* get_perf_context();
} // namespace ROCKSDB_NAMESPACE