|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
#include "monitoring/statistics.h"
|
|
|
|
|
|
|
|
#include <cinttypes>
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "port/likely.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cstdio>
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
// The order of items listed in Tickers should be the same as
|
|
|
|
// the order listed in TickersNameMap
|
|
|
|
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
|
|
|
{BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
|
|
|
|
{BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
|
|
|
|
{BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
|
|
|
|
{BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"},
|
|
|
|
{BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
|
|
|
|
{BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
|
|
|
|
{BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"},
|
|
|
|
{BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"},
|
|
|
|
{BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
|
|
|
|
{BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
|
|
|
|
{BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"},
|
|
|
|
{BLOCK_CACHE_FILTER_BYTES_INSERT,
|
|
|
|
"rocksdb.block.cache.filter.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"},
|
|
|
|
{BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
|
|
|
|
{BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
|
|
|
|
{BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"},
|
|
|
|
{BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
|
|
|
|
{BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
|
|
|
|
{BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
|
|
|
|
{BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"},
|
|
|
|
{BLOOM_FILTER_FULL_TRUE_POSITIVE,
|
|
|
|
"rocksdb.bloom.filter.full.true.positive"},
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
6 years ago
|
|
|
{BLOOM_FILTER_MICROS, "rocksdb.bloom.filter.micros"},
|
|
|
|
{PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"},
|
|
|
|
{PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"},
|
|
|
|
{SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"},
|
|
|
|
{SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"},
|
|
|
|
{MEMTABLE_HIT, "rocksdb.memtable.hit"},
|
|
|
|
{MEMTABLE_MISS, "rocksdb.memtable.miss"},
|
|
|
|
{GET_HIT_L0, "rocksdb.l0.hit"},
|
|
|
|
{GET_HIT_L1, "rocksdb.l1.hit"},
|
|
|
|
{GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"},
|
|
|
|
{COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
|
|
|
|
{COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
|
|
|
|
{COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"},
|
|
|
|
{COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
|
|
|
|
{COMPACTION_RANGE_DEL_DROP_OBSOLETE,
|
|
|
|
"rocksdb.compaction.range_del.drop.obsolete"},
|
|
|
|
{COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
|
|
|
|
"rocksdb.compaction.optimized.del.drop.obsolete"},
|
|
|
|
{COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"},
|
|
|
|
{NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
|
|
|
|
{NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
|
|
|
|
{NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
|
|
|
|
{BYTES_WRITTEN, "rocksdb.bytes.written"},
|
|
|
|
{BYTES_READ, "rocksdb.bytes.read"},
|
|
|
|
{NUMBER_DB_SEEK, "rocksdb.number.db.seek"},
|
|
|
|
{NUMBER_DB_NEXT, "rocksdb.number.db.next"},
|
|
|
|
{NUMBER_DB_PREV, "rocksdb.number.db.prev"},
|
|
|
|
{NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"},
|
|
|
|
{NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"},
|
|
|
|
{NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"},
|
|
|
|
{ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"},
|
|
|
|
{NO_FILE_CLOSES, "rocksdb.no.file.closes"},
|
|
|
|
{NO_FILE_OPENS, "rocksdb.no.file.opens"},
|
|
|
|
{NO_FILE_ERRORS, "rocksdb.no.file.errors"},
|
|
|
|
{STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
|
|
|
|
{STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
|
|
|
|
{STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
|
|
|
|
{STALL_MICROS, "rocksdb.stall.micros"},
|
|
|
|
{DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"},
|
|
|
|
{RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
|
|
|
|
{NO_ITERATORS, "rocksdb.num.iterators"},
|
|
|
|
{NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
|
|
|
|
{NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
|
|
|
|
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
|
|
|
|
{NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
|
|
|
|
{NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
|
|
|
|
{BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
|
|
|
|
{BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
|
|
|
|
{NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
|
|
|
|
{GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
|
|
|
|
{BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
|
|
|
|
{BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
|
|
|
|
{BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"},
|
|
|
|
{BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
|
|
|
|
"rocksdb.block.cachecompressed.add.failures"},
|
|
|
|
{WAL_FILE_SYNCED, "rocksdb.wal.synced"},
|
|
|
|
{WAL_FILE_BYTES, "rocksdb.wal.bytes"},
|
|
|
|
{WRITE_DONE_BY_SELF, "rocksdb.write.self"},
|
|
|
|
{WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
|
|
|
|
{WRITE_TIMEDOUT, "rocksdb.write.timeout"},
|
|
|
|
{WRITE_WITH_WAL, "rocksdb.write.wal"},
|
|
|
|
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
|
|
|
|
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
|
|
|
|
{FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
|
|
|
|
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
|
|
|
|
"rocksdb.number.direct.load.table.properties"},
|
|
|
|
{NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
|
|
|
|
{NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
|
|
|
|
{NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
|
|
|
|
{NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"},
|
|
|
|
{NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"},
|
|
|
|
{NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"},
|
|
|
|
{MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"},
|
|
|
|
{FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
|
|
|
|
{ROW_CACHE_HIT, "rocksdb.row.cache.hit"},
|
|
|
|
{ROW_CACHE_MISS, "rocksdb.row.cache.miss"},
|
|
|
|
{READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"},
|
|
|
|
{READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"},
|
|
|
|
{NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"},
|
|
|
|
{NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"},
|
|
|
|
{BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"},
|
|
|
|
{BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"},
|
|
|
|
{BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"},
|
|
|
|
{BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"},
|
|
|
|
{BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"},
|
|
|
|
{BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"},
|
|
|
|
{BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"},
|
|
|
|
{BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"},
|
|
|
|
{BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
|
|
|
|
{BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
|
|
|
|
{BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
|
|
|
|
{BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
|
|
|
|
{BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
|
|
|
|
{BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
|
|
|
|
{BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
|
|
|
|
{BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
|
|
|
|
{BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"},
|
|
|
|
{BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
|
|
|
|
"rocksdb.blobdb.blob.index.expired.count"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
|
|
|
|
"rocksdb.blobdb.blob.index.evicted.count"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"},
|
|
|
|
{BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"},
|
|
|
|
{BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"},
|
|
|
|
{BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"},
|
|
|
|
{BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"},
|
|
|
|
{BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"},
|
|
|
|
{BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"},
|
|
|
|
{BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"},
|
|
|
|
{BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"},
|
|
|
|
{BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"},
|
|
|
|
{BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
|
|
|
|
{BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
|
|
|
|
{BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
|
|
|
|
{TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"},
|
|
|
|
{TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
|
|
|
|
"rocksdb.txn.overhead.mutex.old.commit.map"},
|
|
|
|
{TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"},
|
|
|
|
{TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"},
|
|
|
|
{NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"},
|
|
|
|
{NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"},
|
|
|
|
{NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_MISS,
|
|
|
|
"rocksdb.block.cache.compression.dict.miss"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_HIT,
|
|
|
|
"rocksdb.block.cache.compression.dict.hit"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_ADD,
|
|
|
|
"rocksdb.block.cache.compression.dict.add"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
|
|
|
|
"rocksdb.block.cache.compression.dict.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
|
|
|
|
"rocksdb.block.cache.compression.dict.bytes.evict"},
|
|
|
|
};
|
|
|
|
|
|
|
|
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
|
|
|
|
{DB_GET, "rocksdb.db.get.micros"},
|
|
|
|
{DB_WRITE, "rocksdb.db.write.micros"},
|
|
|
|
{COMPACTION_TIME, "rocksdb.compaction.times.micros"},
|
|
|
|
{COMPACTION_CPU_TIME, "rocksdb.compaction.times.cpu_micros"},
|
|
|
|
{SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"},
|
|
|
|
{TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"},
|
|
|
|
{COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"},
|
|
|
|
{WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"},
|
|
|
|
{MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"},
|
|
|
|
{TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"},
|
|
|
|
{DB_MULTIGET, "rocksdb.db.multiget.micros"},
|
|
|
|
{READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"},
|
|
|
|
{READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"},
|
|
|
|
{WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"},
|
|
|
|
{STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
|
|
|
|
{STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
|
|
|
|
{STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
|
|
|
|
{HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
|
|
|
|
{SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
|
|
|
|
{NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"},
|
|
|
|
{DB_SEEK, "rocksdb.db.seek.micros"},
|
|
|
|
{WRITE_STALL, "rocksdb.db.write.stall"},
|
|
|
|
{SST_READ_MICROS, "rocksdb.sst.read.micros"},
|
|
|
|
{NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"},
|
|
|
|
{BYTES_PER_READ, "rocksdb.bytes.per.read"},
|
|
|
|
{BYTES_PER_WRITE, "rocksdb.bytes.per.write"},
|
|
|
|
{BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"},
|
|
|
|
{BYTES_COMPRESSED, "rocksdb.bytes.compressed"},
|
|
|
|
{BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"},
|
|
|
|
{COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"},
|
|
|
|
{DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"},
|
|
|
|
{READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"},
|
|
|
|
{BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"},
|
|
|
|
{BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"},
|
|
|
|
{BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"},
|
|
|
|
{BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"},
|
|
|
|
{BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"},
|
|
|
|
{BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"},
|
|
|
|
{BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"},
|
|
|
|
{BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"},
|
|
|
|
{BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"},
|
|
|
|
{BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"},
|
|
|
|
{BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"},
|
|
|
|
{BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"},
|
|
|
|
{BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"},
|
|
|
|
{BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"},
|
|
|
|
{FLUSH_TIME, "rocksdb.db.flush.micros"},
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
6 years ago
|
|
|
{SST_BATCH_SIZE, "rocksdb.sst.batch.size"},
|
|
|
|
};
|
|
|
|
|
|
|
|
std::shared_ptr<Statistics> CreateDBStatistics() {
|
|
|
|
return std::make_shared<StatisticsImpl>(nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Takes ownership of `stats` (may be null). When non-null, the mutating
// methods in this file (setTickerCount, getAndResetTickerCount, recordTick,
// recordInHistogram) also forward to it.
StatisticsImpl::StatisticsImpl(std::shared_ptr<Statistics> stats)
    : stats_(std::move(stats)) {}
|
|
|
|
|
|
|
|
// Nothing to release by hand; members clean up after themselves.
StatisticsImpl::~StatisticsImpl() = default;
|
|
|
|
|
|
|
|
uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const {
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
return getTickerCountLocked(tickerType);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Adds up ticker `tickerType` over every per-core slot and returns the sum.
// Does not lock by itself; the callers in this file hold aggregate_lock_.
// `tickerType` must be below TICKER_ENUM_MAX (checked by assert only).
uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const {
  assert(tickerType < TICKER_ENUM_MAX);
  uint64_t total = 0;
  for (size_t i = 0; i < per_core_stats_.Size(); ++i) {
    auto* core = per_core_stats_.AccessAtCore(i);
    total += core->tickers_[tickerType];
  }
  return total;
}
|
|
|
|
|
|
|
|
void StatisticsImpl::histogramData(uint32_t histogramType,
|
|
|
|
HistogramData* const data) const {
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
getHistogramImplLocked(histogramType)->Data(data);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<HistogramImpl> StatisticsImpl::getHistogramImplLocked(
|
|
|
|
uint32_t histogramType) const {
|
|
|
|
assert(histogramType < HISTOGRAM_ENUM_MAX);
|
|
|
|
std::unique_ptr<HistogramImpl> res_hist(new HistogramImpl());
|
|
|
|
for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
|
|
|
|
res_hist->Merge(
|
|
|
|
per_core_stats_.AccessAtCore(core_idx)->histograms_[histogramType]);
|
|
|
|
}
|
|
|
|
return res_hist;
|
|
|
|
}
|
|
|
|
|
Add Statistics.getHistogramString() to print more detailed outputs of a histogram
Summary:
Provide a way for users to see a more detailed distribution of a histogram metric. Example output:
Manually add statement
fprintf(stdout, "%s\n", dbstats->getHistogramString(SST_READ_MICROS).c_str());
Will print out something like:
Count: 989151 Average: 1.7659 StdDev: 1.52
Min: 0.0000 Median: 1.2071 Max: 860.0000
Percentiles: P50: 1.21 P75: 1.70 P99: 5.12 P99.9: 13.67 P99.99: 21.70
------------------------------------------------------
[ 0, 1 ) 390839 39.513% 39.513% ########
[ 1, 2 ) 500918 50.641% 90.154% ##########
[ 2, 3 ) 79358 8.023% 98.177% ##
[ 3, 4 ) 6297 0.637% 98.813%
[ 4, 5 ) 1712 0.173% 98.986%
[ 5, 6 ) 1134 0.115% 99.101%
[ 6, 7 ) 1222 0.124% 99.224%
[ 7, 8 ) 1529 0.155% 99.379%
[ 8, 9 ) 1264 0.128% 99.507%
[ 9, 10 ) 988 0.100% 99.607%
[ 10, 12 ) 1378 0.139% 99.746%
[ 12, 14 ) 1828 0.185% 99.931%
[ 14, 16 ) 410 0.041% 99.972%
[ 16, 18 ) 72 0.007% 99.980%
[ 18, 20 ) 67 0.007% 99.986%
[ 20, 25 ) 106 0.011% 99.997%
[ 25, 30 ) 24 0.002% 99.999%
[ 30, 35 ) 1 0.000% 100.000%
[ 250, 300 ) 2 0.000% 100.000%
[ 300, 350 ) 1 0.000% 100.000%
[ 800, 900 ) 1 0.000% 100.000%
Test Plan: Manually add a print in db_bench and make sure it prints out as expected. Will add some codes to cover the function
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D43611
10 years ago
|
|
|
std::string StatisticsImpl::getHistogramString(uint32_t histogramType) const {
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
return getHistogramImplLocked(histogramType)->ToString();
|
Thread-specific histogram statistics
Summary:
To reduce contention for atomics when HistogramStats are shared across
threads, this diff makes them thread-specific so updates are faster. This comes
at the expense of slower reads (much less frequent), which now require merging
all histograms. In this diff,
- Thread-specific HistogramImpl is created upon the thread's first measureTime()
- Thread-specific HistogramImpl are merged and deleted upon thread termination or ThreadLocalPtr destruction, whichever comes first
- getHistogramString() and histogramData() merge all histograms, both thread-specific and previously merged ones
Test Plan:
unit tests, ran db_bench and verified histograms look similar
before:
$ TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench --benchmarks=readwhilewriting --statistics --num=1000000 --use_existing_db --threads=64 --cache_size=250000000 --compression_type=lz4
...
+ 7.63% db_bench db_bench [.] rocksdb::HistogramStat::Add
after:
$ TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench --benchmarks=readwhilewriting --statistics --num=1000000 --use_existing_db --threads=64 --cache_size=250000000 --compression_type=lz4
...
+ 0.98% db_bench db_bench [.] rocksdb::HistogramStat::Add
Reviewers: sdong, MarkCallaghan, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62649
9 years ago
|
|
|
}
|
|
|
|
|
|
|
|
void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) {
|
|
|
|
{
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
setTickerCountLocked(tickerType, count);
|
|
|
|
}
|
|
|
|
if (stats_ && tickerType < TICKER_ENUM_MAX) {
|
|
|
|
stats_->setTickerCount(tickerType, count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Makes the aggregated value of ticker `tickerType` equal to `count` by
// storing `count` in the slot at index 0 and zero in every other slot.
// Does not lock; the callers in this file hold aggregate_lock_.
// `tickerType` must be below TICKER_ENUM_MAX (checked by assert only).
void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) {
  assert(tickerType < TICKER_ENUM_MAX);
  for (size_t i = 0; i < per_core_stats_.Size(); ++i) {
    auto* core = per_core_stats_.AccessAtCore(i);
    core->tickers_[tickerType] = (i == 0) ? count : 0;
  }
}
|
|
|
|
|
|
|
|
uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) {
|
|
|
|
uint64_t sum = 0;
|
|
|
|
{
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
assert(tickerType < TICKER_ENUM_MAX);
|
|
|
|
for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
|
|
|
|
sum +=
|
|
|
|
per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange(
|
|
|
|
0, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (stats_ && tickerType < TICKER_ENUM_MAX) {
|
|
|
|
stats_->setTickerCount(tickerType, 0);
|
|
|
|
}
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) {
|
|
|
|
assert(tickerType < TICKER_ENUM_MAX);
|
|
|
|
per_core_stats_.Access()->tickers_[tickerType].fetch_add(
|
|
|
|
count, std::memory_order_relaxed);
|
|
|
|
if (stats_ && tickerType < TICKER_ENUM_MAX) {
|
|
|
|
stats_->recordTick(tickerType, count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void StatisticsImpl::recordInHistogram(uint32_t histogramType, uint64_t value) {
|
|
|
|
assert(histogramType < HISTOGRAM_ENUM_MAX);
|
|
|
|
if (get_stats_level() <= StatsLevel::kExceptHistogramOrTimers) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
per_core_stats_.Access()->histograms_[histogramType].Add(value);
|
|
|
|
if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) {
|
|
|
|
stats_->recordInHistogram(histogramType, value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Zeroes every ticker and clears every histogram in all per-core slots while
// holding aggregate_lock_. The wrapped Statistics object is not touched.
// Always returns Status::OK().
Status StatisticsImpl::Reset() {
  MutexLock guard(&aggregate_lock_);

  for (uint32_t ticker = 0; ticker < TICKER_ENUM_MAX; ++ticker) {
    setTickerCountLocked(ticker, 0);
  }

  for (uint32_t hist = 0; hist < HISTOGRAM_ENUM_MAX; ++hist) {
    for (size_t core = 0; core < per_core_stats_.Size(); ++core) {
      auto* slot = per_core_stats_.AccessAtCore(core);
      slot->histograms_[hist].Clear();
    }
  }

  return Status::OK();
}
|
|
|
|
|
|
|
|
namespace {

// Size, in bytes, of the temporary char buffers used to format one line of
// text at a time.
constexpr int kTmpStrBufferSize = 200;

}  // namespace
|
|
|
|
|
|
|
|
std::string StatisticsImpl::ToString() const {
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
std::string res;
|
|
|
|
res.reserve(20000);
|
|
|
|
for (const auto& t : TickersNameMap) {
|
|
|
|
assert(t.first < TICKER_ENUM_MAX);
|
|
|
|
char buffer[kTmpStrBufferSize];
|
|
|
|
snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n",
|
|
|
|
t.second.c_str(), getTickerCountLocked(t.first));
|
|
|
|
res.append(buffer);
|
|
|
|
}
|
|
|
|
for (const auto& h : HistogramsNameMap) {
|
|
|
|
assert(h.first < HISTOGRAM_ENUM_MAX);
|
|
|
|
char buffer[kTmpStrBufferSize];
|
|
|
|
HistogramData hData;
|
|
|
|
getHistogramImplLocked(h.first)->Data(&hData);
|
|
|
|
// don't handle failures - buffer should always be big enough and arguments
|
|
|
|
// should be provided correctly
|
|
|
|
int ret =
|
|
|
|
snprintf(buffer, kTmpStrBufferSize,
|
|
|
|
"%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64
|
|
|
|
" SUM : %" PRIu64 "\n",
|
|
|
|
h.second.c_str(), hData.median, hData.percentile95,
|
|
|
|
hData.percentile99, hData.max, hData.count, hData.sum);
|
|
|
|
if (ret < 0 || ret >= kTmpStrBufferSize) {
|
|
|
|
assert(false);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
res.append(buffer);
|
|
|
|
}
|
|
|
|
res.shrink_to_fit();
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool StatisticsImpl::getTickerMap(
|
|
|
|
std::map<std::string, uint64_t>* stats_map) const {
|
|
|
|
assert(stats_map);
|
|
|
|
if (!stats_map) return false;
|
|
|
|
stats_map->clear();
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
for (const auto& t : TickersNameMap) {
|
|
|
|
assert(t.first < TICKER_ENUM_MAX);
|
|
|
|
(*stats_map)[t.second.c_str()] = getTickerCountLocked(t.first);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool StatisticsImpl::HistEnabledForType(uint32_t type) const {
|
|
|
|
return type < HISTOGRAM_ENUM_MAX;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|