You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rocksdb/include/rocksdb/statistics.h

399 lines
16 KiB

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <string>
#include <memory>
#include <vector>
namespace rocksdb {
/**
* Keep adding ticker's here.
* 1. Any ticker should be added before TICKER_ENUM_MAX.
* 2. Add a readable string in TickersNameMap below for the newly added ticker.
*/
enum Tickers : uint32_t {
// total block cache misses
// REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
// BLOCK_CACHE_FILTER_MISS +
// BLOCK_CACHE_DATA_MISS;
BLOCK_CACHE_MISS = 0,
// total block cache hit
// REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
// BLOCK_CACHE_FILTER_HIT +
// BLOCK_CACHE_DATA_HIT;
BLOCK_CACHE_HIT,
// # of blocks added to block cache.
BLOCK_CACHE_ADD,
// # of failures when adding blocks to block cache.
BLOCK_CACHE_ADD_FAILURES,
// # of times cache miss when accessing index block from block cache.
BLOCK_CACHE_INDEX_MISS,
// # of times cache hit when accessing index block from block cache.
BLOCK_CACHE_INDEX_HIT,
Add statistics field to show total size of index and filter blocks in block cache Summary: With `table_options.cache_index_and_filter_blocks = true`, index and filter blocks are stored in block cache. Then people are curious how much of the block cache total size is used by indexes and bloom filters. It will be nice we have a way to report that. It can help people tune performance and plan for optimized hardware setting. We add several enum values for db Statistics. BLOCK_CACHE_INDEX/FILTER_BYTES_INSERT - BLOCK_CACHE_INDEX/FILTER_BYTES_ERASE = current INDEX/FILTER total block size in bytes. Test Plan: write a test case called `DBBlockCacheTest.IndexAndFilterBlocksStats`. The result is: ``` [gzh@dev9927.prn1 ~/local/rocksdb] make db_block_cache_test -j64 && ./db_block_cache_test --gtest_filter=DBBlockCacheTest.IndexAndFilterBlocksStats Makefile:101: Warning: Compiling in debug mode. Don't use the resulting binary in production GEN util/build_version.cc make: `db_block_cache_test' is up to date. Note: Google Test filter = DBBlockCacheTest.IndexAndFilterBlocksStats [==========] Running 1 test from 1 test case. [----------] Global test environment set-up. [----------] 1 test from DBBlockCacheTest [ RUN ] DBBlockCacheTest.IndexAndFilterBlocksStats [ OK ] DBBlockCacheTest.IndexAndFilterBlocksStats (689 ms) [----------] 1 test from DBBlockCacheTest (689 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test case ran. (689 ms total) [ PASSED ] 1 test. ``` Reviewers: IslamAbdelRahman, andrewkr, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D58677
9 years ago
// # of bytes of index blocks inserted into cache
BLOCK_CACHE_INDEX_BYTES_INSERT,
// # of bytes of index block erased from cache
BLOCK_CACHE_INDEX_BYTES_EVICT,
// # of times cache miss when accessing filter block from block cache.
BLOCK_CACHE_FILTER_MISS,
// # of times cache hit when accessing filter block from block cache.
BLOCK_CACHE_FILTER_HIT,
Add statistics field to show total size of index and filter blocks in block cache Summary: With `table_options.cache_index_and_filter_blocks = true`, index and filter blocks are stored in block cache. Then people are curious how much of the block cache total size is used by indexes and bloom filters. It will be nice we have a way to report that. It can help people tune performance and plan for optimized hardware setting. We add several enum values for db Statistics. BLOCK_CACHE_INDEX/FILTER_BYTES_INSERT - BLOCK_CACHE_INDEX/FILTER_BYTES_ERASE = current INDEX/FILTER total block size in bytes. Test Plan: write a test case called `DBBlockCacheTest.IndexAndFilterBlocksStats`. The result is: ``` [gzh@dev9927.prn1 ~/local/rocksdb] make db_block_cache_test -j64 && ./db_block_cache_test --gtest_filter=DBBlockCacheTest.IndexAndFilterBlocksStats Makefile:101: Warning: Compiling in debug mode. Don't use the resulting binary in production GEN util/build_version.cc make: `db_block_cache_test' is up to date. Note: Google Test filter = DBBlockCacheTest.IndexAndFilterBlocksStats [==========] Running 1 test from 1 test case. [----------] Global test environment set-up. [----------] 1 test from DBBlockCacheTest [ RUN ] DBBlockCacheTest.IndexAndFilterBlocksStats [ OK ] DBBlockCacheTest.IndexAndFilterBlocksStats (689 ms) [----------] 1 test from DBBlockCacheTest (689 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test case ran. (689 ms total) [ PASSED ] 1 test. ``` Reviewers: IslamAbdelRahman, andrewkr, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D58677
9 years ago
// # of bytes of bloom filter blocks inserted into cache
BLOCK_CACHE_FILTER_BYTES_INSERT,
// # of bytes of bloom filter block erased from cache
BLOCK_CACHE_FILTER_BYTES_EVICT,
// # of times cache miss when accessing data block from block cache.
BLOCK_CACHE_DATA_MISS,
// # of times cache hit when accessing data block from block cache.
BLOCK_CACHE_DATA_HIT,
// # of bytes read from cache.
BLOCK_CACHE_BYTES_READ,
// # of bytes written into cache.
BLOCK_CACHE_BYTES_WRITE,
Add statistics field to show total size of index and filter blocks in block cache Summary: With `table_options.cache_index_and_filter_blocks = true`, index and filter blocks are stored in block cache. Then people are curious how much of the block cache total size is used by indexes and bloom filters. It will be nice we have a way to report that. It can help people tune performance and plan for optimized hardware setting. We add several enum values for db Statistics. BLOCK_CACHE_INDEX/FILTER_BYTES_INSERT - BLOCK_CACHE_INDEX/FILTER_BYTES_ERASE = current INDEX/FILTER total block size in bytes. Test Plan: write a test case called `DBBlockCacheTest.IndexAndFilterBlocksStats`. The result is: ``` [gzh@dev9927.prn1 ~/local/rocksdb] make db_block_cache_test -j64 && ./db_block_cache_test --gtest_filter=DBBlockCacheTest.IndexAndFilterBlocksStats Makefile:101: Warning: Compiling in debug mode. Don't use the resulting binary in production GEN util/build_version.cc make: `db_block_cache_test' is up to date. Note: Google Test filter = DBBlockCacheTest.IndexAndFilterBlocksStats [==========] Running 1 test from 1 test case. [----------] Global test environment set-up. [----------] 1 test from DBBlockCacheTest [ RUN ] DBBlockCacheTest.IndexAndFilterBlocksStats [ OK ] DBBlockCacheTest.IndexAndFilterBlocksStats (689 ms) [----------] 1 test from DBBlockCacheTest (689 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test case ran. (689 ms total) [ PASSED ] 1 test. ``` Reviewers: IslamAbdelRahman, andrewkr, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D58677
9 years ago
// # of times bloom filter has avoided file reads.
BLOOM_FILTER_USEFUL,
// # persistent cache hit
PERSISTENT_CACHE_HIT,
// # persistent cache miss
PERSISTENT_CACHE_MISS,
// # of memtable hits.
MEMTABLE_HIT,
// # of memtable misses.
MEMTABLE_MISS,
// # of Get() queries served by L0
GET_HIT_L0,
// # of Get() queries served by L1
GET_HIT_L1,
// # of Get() queries served by L2 and up
GET_HIT_L2_AND_UP,
/**
* COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
* There are 3 reasons currently.
*/
COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
// Number of keys written to the database via the Put and Write call's
NUMBER_KEYS_WRITTEN,
// Number of Keys read,
NUMBER_KEYS_READ,
// Number keys updated, if inplace update is enabled
NUMBER_KEYS_UPDATED,
// The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
// DB::Merge(), and DB::Write().
BYTES_WRITTEN,
// The number of uncompressed bytes read from DB::Get(). It could be
// either from memtables, cache, or table files.
// For the number of logical bytes read from DB::MultiGet(),
// please use NUMBER_MULTIGET_BYTES_READ.
BYTES_READ,
// The number of calls to seek/next/prev
NUMBER_DB_SEEK,
NUMBER_DB_NEXT,
NUMBER_DB_PREV,
// The number of calls to seek/next/prev that returned data
NUMBER_DB_SEEK_FOUND,
NUMBER_DB_NEXT_FOUND,
NUMBER_DB_PREV_FOUND,
// The number of uncompressed bytes read from an iterator.
// Includes size of key and value.
ITER_BYTES_READ,
NO_FILE_CLOSES,
NO_FILE_OPENS,
NO_FILE_ERRORS,
// DEPRECATED Time system had to wait to do LO-L1 compactions
STALL_L0_SLOWDOWN_MICROS,
// DEPRECATED Time system had to wait to move memtable to L1.
STALL_MEMTABLE_COMPACTION_MICROS,
// DEPRECATED write throttle because of too many files in L0
STALL_L0_NUM_FILES_MICROS,
// Writer has to wait for compaction or flush to finish.
STALL_MICROS,
// The wait time for db mutex.
// Disabled by default. To enable it set stats level to kAll
DB_MUTEX_WAIT_MICROS,
RATE_LIMIT_DELAY_MILLIS,
NO_ITERATORS, // number of iterators currently open
// Number of MultiGet calls, keys read, and bytes read
NUMBER_MULTIGET_CALLS,
NUMBER_MULTIGET_KEYS_READ,
NUMBER_MULTIGET_BYTES_READ,
// Number of deletes records that were not required to be
// written to storage because key does not exist
NUMBER_FILTERED_DELETES,
NUMBER_MERGE_FAILURES,
SEQUENCE_NUMBER,
// number of times bloom was checked before creating iterator on a
// file, and the number of times the check was useful in avoiding
// iterator creation (and thus likely IOPs).
BLOOM_FILTER_PREFIX_CHECKED,
BLOOM_FILTER_PREFIX_USEFUL,
// Number of times we had to reseek inside an iteration to skip
// over large number of keys with same userkey.
NUMBER_OF_RESEEKS_IN_ITERATION,
// Record the number of calls to GetUpadtesSince. Useful to keep track of
// transaction log iterator refreshes
GET_UPDATES_SINCE_CALLS,
BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
// Number of blocks added to comopressed block cache
BLOCK_CACHE_COMPRESSED_ADD,
// Number of failures when adding blocks to compressed block cache
BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
WAL_FILE_SYNCED, // Number of times WAL sync is done
WAL_FILE_BYTES, // Number of bytes written to WAL
Add monitoring for universal compaction and add counters for compaction IO Summary: Adds these counters { WAL_FILE_SYNCED, "rocksdb.wal.synced" } number of writes that request a WAL sync { WAL_FILE_BYTES, "rocksdb.wal.bytes" }, number of bytes written to the WAL { WRITE_DONE_BY_SELF, "rocksdb.write.self" }, number of writes processed by the calling thread { WRITE_DONE_BY_OTHER, "rocksdb.write.other" }, number of writes not processed by the calling thread. Instead these were processed by the current holder of the write lock { WRITE_WITH_WAL, "rocksdb.write.wal" }, number of writes that request WAL logging { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" }, number of bytes read during compaction { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" }, number of bytes written during compaction Per-interval stats output was updated with WAL stats and correct stats for universal compaction including a correct value for write-amplification. It now looks like: Compactions Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn Rnp1 Wnp1 NewW Count Ln-stall Stall-cnt -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 0 7 464 46.4 281 3411 3875 3411 0 3875 2.1 12.1 13.8 621 0 240 240 628 0.0 0 Uptime(secs): 310.8 total, 2.0 interval Writes cumulative: 9999999 total, 9999999 batches, 1.0 per batch, 1.22 ingest GB WAL cumulative: 9999999 WAL writes, 9999999 WAL syncs, 1.00 writes per sync, 1.22 GB written Compaction IO cumulative (GB): 1.22 new, 3.33 read, 3.78 write, 7.12 read+write Compaction IO cumulative (MB/sec): 4.0 new, 11.0 read, 12.5 write, 23.4 read+write Amplification cumulative: 4.1 write, 6.8 compaction Writes interval: 100000 total, 100000 batches, 1.0 per batch, 12.5 ingest MB WAL interval: 100000 WAL writes, 100000 WAL syncs, 1.00 writes per sync, 0.01 MB written Compaction IO interval (MB): 12.49 new, 14.98 read, 21.50 write, 36.48 read+write Compaction IO interval (MB/sec): 6.4 new, 7.6 read, 11.0 write, 18.6 read+write Amplification interval: 101.7 write, 102.9 compaction Stalls(secs): 142.924 level0_slowdown, 0.000 level0_numfiles, 0.805 memtable_compaction, 0.000 leveln_slowdown Stalls(count): 132461 level0_slowdown, 0 level0_numfiles, 3 memtable_compaction, 0 leveln_slowdown Task ID: #3329644, #3301695 Blame Rev: Test Plan: Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14583
11 years ago
// Writes can be processed by requesting thread or by the thread at the
// head of the writers queue.
WRITE_DONE_BY_SELF,
support for concurrent adds to memtable Summary: This diff adds support for concurrent adds to the skiplist memtable implementations. Memory allocation is made thread-safe by the addition of a spinlock, with small per-core buffers to avoid contention. Concurrent memtable writes are made via an additional method and don't impose a performance overhead on the non-concurrent case, so parallelism can be selected on a per-batch basis. Write thread synchronization is an increasing bottleneck for higher levels of concurrency, so this diff adds --enable_write_thread_adaptive_yield (default off). This feature causes threads joining a write batch group to spin for a short time (default 100 usec) using sched_yield, rather than going to sleep on a mutex. If the timing of the yield calls indicates that another thread has actually run during the yield then spinning is avoided. This option improves performance for concurrent situations even without parallel adds, although it has the potential to increase CPU usage (and the heuristic adaptation is not yet mature). Parallel writes are not currently compatible with inplace updates, update callbacks, or delete filtering. Enable it with --allow_concurrent_memtable_write (and --enable_write_thread_adaptive_yield). Parallel memtable writes are performance neutral when there is no actual parallelism, and in my experiments (SSD server-class Linux and varying contention and key sizes for fillrandom) they are always a performance win when there is more than one thread. Statistics are updated earlier in the write path, dropping the number of DB mutex acquisitions from 2 to 1 for almost all cases. This diff was motivated and inspired by Yahoo's cLSM work. It is more conservative than cLSM: RocksDB's write batch group leader role is preserved (along with all of the existing flush and write throttling logic) and concurrent writers are blocked until all memtable insertions have completed and the sequence number has been advanced, to preserve linearizability. My test config is "db_bench -benchmarks=fillrandom -threads=$T -batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T -level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999 -disable_auto_compactions --max_write_buffer_number=8 -max_background_flushes=8 --disable_wal --write_buffer_size=160000000 --block_size=16384 --allow_concurrent_memtable_write" on a two-socket Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1 thread I get ~440Kops/sec. Peak performance for 1 socket (numactl -N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance across both sockets happens at 30 threads, and is ~900Kops/sec, although with fewer threads there is less performance loss when the system has background work. Test Plan: 1. concurrent stress tests for InlineSkipList and DynamicBloom 2. make clean; make check 3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench 4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench 5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench 6. make clean; OPT=-DROCKSDB_LITE make check 7. verify no perf regressions when disabled Reviewers: igor, sdong Reviewed By: sdong Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba Differential Revision: https://reviews.facebook.net/D50589
9 years ago
WRITE_DONE_BY_OTHER, // Equivalent to writes done for others
Implement full filter for block based table. Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979
10 years ago
WRITE_TIMEDOUT, // Number of writes ending up with timed-out.
WRITE_WITH_WAL, // Number of Write calls that request WAL
COMPACT_READ_BYTES, // Bytes read during compaction
COMPACT_WRITE_BYTES, // Bytes written during compaction
FLUSH_WRITE_BYTES, // Bytes written during flush
Add monitoring for universal compaction and add counters for compaction IO Summary: Adds these counters { WAL_FILE_SYNCED, "rocksdb.wal.synced" } number of writes that request a WAL sync { WAL_FILE_BYTES, "rocksdb.wal.bytes" }, number of bytes written to the WAL { WRITE_DONE_BY_SELF, "rocksdb.write.self" }, number of writes processed by the calling thread { WRITE_DONE_BY_OTHER, "rocksdb.write.other" }, number of writes not processed by the calling thread. Instead these were processed by the current holder of the write lock { WRITE_WITH_WAL, "rocksdb.write.wal" }, number of writes that request WAL logging { COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" }, number of bytes read during compaction { COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" }, number of bytes written during compaction Per-interval stats output was updated with WAL stats and correct stats for universal compaction including a correct value for write-amplification. It now looks like: Compactions Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn Rnp1 Wnp1 NewW Count Ln-stall Stall-cnt -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 0 7 464 46.4 281 3411 3875 3411 0 3875 2.1 12.1 13.8 621 0 240 240 628 0.0 0 Uptime(secs): 310.8 total, 2.0 interval Writes cumulative: 9999999 total, 9999999 batches, 1.0 per batch, 1.22 ingest GB WAL cumulative: 9999999 WAL writes, 9999999 WAL syncs, 1.00 writes per sync, 1.22 GB written Compaction IO cumulative (GB): 1.22 new, 3.33 read, 3.78 write, 7.12 read+write Compaction IO cumulative (MB/sec): 4.0 new, 11.0 read, 12.5 write, 23.4 read+write Amplification cumulative: 4.1 write, 6.8 compaction Writes interval: 100000 total, 100000 batches, 1.0 per batch, 12.5 ingest MB WAL interval: 100000 WAL writes, 100000 WAL syncs, 1.00 writes per sync, 0.01 MB written Compaction IO interval (MB): 12.49 new, 14.98 read, 21.50 write, 36.48 read+write Compaction IO interval (MB/sec): 6.4 new, 7.6 read, 11.0 write, 18.6 read+write Amplification interval: 101.7 write, 102.9 compaction Stalls(secs): 142.924 level0_slowdown, 0.000 level0_numfiles, 0.805 memtable_compaction, 0.000 leveln_slowdown Stalls(count): 132461 level0_slowdown, 0 level0_numfiles, 3 memtable_compaction, 0 leveln_slowdown Task ID: #3329644, #3301695 Blame Rev: Test Plan: Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14583
11 years ago
// Number of table's properties loaded directly from file, without creating
// table reader object.
NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
NUMBER_SUPERVERSION_ACQUIRES,
NUMBER_SUPERVERSION_RELEASES,
NUMBER_SUPERVERSION_CLEANUPS,
NUMBER_BLOCK_NOT_COMPRESSED,
MERGE_OPERATION_TOTAL_TIME,
FILTER_OPERATION_TOTAL_TIME,
// Row cache.
ROW_CACHE_HIT,
ROW_CACHE_MISS,
TICKER_ENUM_MAX
};
// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
{BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
{BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
{BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"},
{BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
{BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
Add statistics field to show total size of index and filter blocks in block cache Summary: With `table_options.cache_index_and_filter_blocks = true`, index and filter blocks are stored in block cache. Then people are curious how much of the block cache total size is used by indexes and bloom filters. It will be nice we have a way to report that. It can help people tune performance and plan for optimized hardware setting. We add several enum values for db Statistics. BLOCK_CACHE_INDEX/FILTER_BYTES_INSERT - BLOCK_CACHE_INDEX/FILTER_BYTES_ERASE = current INDEX/FILTER total block size in bytes. Test Plan: write a test case called `DBBlockCacheTest.IndexAndFilterBlocksStats`. The result is: ``` [gzh@dev9927.prn1 ~/local/rocksdb] make db_block_cache_test -j64 && ./db_block_cache_test --gtest_filter=DBBlockCacheTest.IndexAndFilterBlocksStats Makefile:101: Warning: Compiling in debug mode. Don't use the resulting binary in production GEN util/build_version.cc make: `db_block_cache_test' is up to date. Note: Google Test filter = DBBlockCacheTest.IndexAndFilterBlocksStats [==========] Running 1 test from 1 test case. [----------] Global test environment set-up. [----------] 1 test from DBBlockCacheTest [ RUN ] DBBlockCacheTest.IndexAndFilterBlocksStats [ OK ] DBBlockCacheTest.IndexAndFilterBlocksStats (689 ms) [----------] 1 test from DBBlockCacheTest (689 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test case ran. (689 ms total) [ PASSED ] 1 test. ``` Reviewers: IslamAbdelRahman, andrewkr, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D58677
9 years ago
{BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"},
{BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"},
{BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
{BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
Add statistics field to show total size of index and filter blocks in block cache Summary: With `table_options.cache_index_and_filter_blocks = true`, index and filter blocks are stored in block cache. Then people are curious how much of the block cache total size is used by indexes and bloom filters. It will be nice we have a way to report that. It can help people tune performance and plan for optimized hardware setting. We add several enum values for db Statistics. BLOCK_CACHE_INDEX/FILTER_BYTES_INSERT - BLOCK_CACHE_INDEX/FILTER_BYTES_ERASE = current INDEX/FILTER total block size in bytes. Test Plan: write a test case called `DBBlockCacheTest.IndexAndFilterBlocksStats`. The result is: ``` [gzh@dev9927.prn1 ~/local/rocksdb] make db_block_cache_test -j64 && ./db_block_cache_test --gtest_filter=DBBlockCacheTest.IndexAndFilterBlocksStats Makefile:101: Warning: Compiling in debug mode. Don't use the resulting binary in production GEN util/build_version.cc make: `db_block_cache_test' is up to date. Note: Google Test filter = DBBlockCacheTest.IndexAndFilterBlocksStats [==========] Running 1 test from 1 test case. [----------] Global test environment set-up. [----------] 1 test from DBBlockCacheTest [ RUN ] DBBlockCacheTest.IndexAndFilterBlocksStats [ OK ] DBBlockCacheTest.IndexAndFilterBlocksStats (689 ms) [----------] 1 test from DBBlockCacheTest (689 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test case ran. (689 ms total) [ PASSED ] 1 test. ``` Reviewers: IslamAbdelRahman, andrewkr, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D58677
9 years ago
{BLOCK_CACHE_FILTER_BYTES_INSERT,
"rocksdb.block.cache.filter.bytes.insert"},
{BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"},
{BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
{BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
{BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
{BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
{BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
{MEMTABLE_HIT, "rocksdb.memtable.hit"},
{MEMTABLE_MISS, "rocksdb.memtable.miss"},
{GET_HIT_L0, "rocksdb.l0.hit"},
{GET_HIT_L1, "rocksdb.l1.hit"},
{GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"},
{COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
{COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
{COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
{NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
{NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
{NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
{BYTES_WRITTEN, "rocksdb.bytes.written"},
{BYTES_READ, "rocksdb.bytes.read"},
{NUMBER_DB_SEEK, "rocksdb.number.db.seek"},
{NUMBER_DB_NEXT, "rocksdb.number.db.next"},
{NUMBER_DB_PREV, "rocksdb.number.db.prev"},
{NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"},
{NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"},
{NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"},
{ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"},
{NO_FILE_CLOSES, "rocksdb.no.file.closes"},
{NO_FILE_OPENS, "rocksdb.no.file.opens"},
{NO_FILE_ERRORS, "rocksdb.no.file.errors"},
{STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
{STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
{STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
{STALL_MICROS, "rocksdb.stall.micros"},
{DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"},
{RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
{NO_ITERATORS, "rocksdb.num.iterators"},
{NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
{NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
{NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
{NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
{SEQUENCE_NUMBER, "rocksdb.sequence.number"},
{BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
{BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
{NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
{GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
{BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
{BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
{BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"},
{BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
"rocksdb.block.cachecompressed.add.failures"},
{WAL_FILE_SYNCED, "rocksdb.wal.synced"},
{WAL_FILE_BYTES, "rocksdb.wal.bytes"},
{WRITE_DONE_BY_SELF, "rocksdb.write.self"},
{WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
{WRITE_WITH_WAL, "rocksdb.write.wal"},
{FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
"rocksdb.number.direct.load.table.properties"},
{NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
{NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
{NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
{NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"},
{MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"},
{FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
{ROW_CACHE_HIT, "rocksdb.row.cache.hit"},
{ROW_CACHE_MISS, "rocksdb.row.cache.miss"},
};
/**
* Keep adding histogram's here.
* Any histogram whould have value less than HISTOGRAM_ENUM_MAX
* Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
* Add a string representation in HistogramsNameMap below
* And increment HISTOGRAM_ENUM_MAX
*/
enum Histograms : uint32_t {
DB_GET = 0,
DB_WRITE,
COMPACTION_TIME,
SUBCOMPACTION_SETUP_TIME,
TABLE_SYNC_MICROS,
COMPACTION_OUTFILE_SYNC_MICROS,
WAL_FILE_SYNC_MICROS,
MANIFEST_FILE_SYNC_MICROS,
// TIME SPENT IN IO DURING TABLE OPEN
TABLE_OPEN_IO_MICROS,
DB_MULTIGET,
READ_BLOCK_COMPACTION_MICROS,
READ_BLOCK_GET_MICROS,
WRITE_RAW_BLOCK_MICROS,
STALL_L0_SLOWDOWN_COUNT,
STALL_MEMTABLE_COMPACTION_COUNT,
STALL_L0_NUM_FILES_COUNT,
HARD_RATE_LIMIT_DELAY_COUNT,
SOFT_RATE_LIMIT_DELAY_COUNT,
NUM_FILES_IN_SINGLE_COMPACTION,
DB_SEEK,
WRITE_STALL,
SST_READ_MICROS,
// The number of subcompactions actually scheduled during a compaction
NUM_SUBCOMPACTIONS_SCHEDULED,
// Value size distribution in each operation
BYTES_PER_READ,
BYTES_PER_WRITE,
BYTES_PER_MULTIGET,
HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match
};
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{DB_GET, "rocksdb.db.get.micros"},
{DB_WRITE, "rocksdb.db.write.micros"},
{COMPACTION_TIME, "rocksdb.compaction.times.micros"},
{SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"},
{TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"},
{COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"},
{WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"},
{MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"},
{TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"},
{DB_MULTIGET, "rocksdb.db.multiget.micros"},
{READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"},
{READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"},
{WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"},
{STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
{STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
{STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
{HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
{SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
{NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"},
{DB_SEEK, "rocksdb.db.seek.micros"},
{WRITE_STALL, "rocksdb.db.write.stall"},
{SST_READ_MICROS, "rocksdb.sst.read.micros"},
{NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"},
{BYTES_PER_READ, "rocksdb.bytes.per.read"},
{BYTES_PER_WRITE, "rocksdb.bytes.per.write"},
{BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"},
};
struct HistogramData {
double median;
double percentile95;
double percentile99;
double average;
double standard_deviation;
};
enum StatsLevel {
// Collect all stats except the counters requiring to get time inside the
// mutex lock.
kExceptTimeForMutex,
// Collect all stats, including measuring duration of mutex operations.
// If getting time is expensive on the platform to run, it can
// reduce scalability to more threads, especialy for writes.
kAll,
};
// Analyze the performance of a db
class Statistics {
public:
virtual ~Statistics() {}
virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
virtual void histogramData(uint32_t type,
HistogramData* const data) const = 0;
virtual std::string getHistogramString(uint32_t type) const { return ""; }
virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
virtual void measureTime(uint32_t histogramType, uint64_t time) = 0;
// String representation of the statistic object.
virtual std::string ToString() const {
// Do nothing by default
return std::string("ToString(): not implemented");
}
// Override this function to disable particular histogram collection
virtual bool HistEnabledForType(uint32_t type) const {
return type < HISTOGRAM_ENUM_MAX;
}
StatsLevel stats_level_ = kExceptTimeForMutex;
};
// Create a concrete DBStatistics object
std::shared_ptr<Statistics> CreateDBStatistics();
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_