|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#ifndef GFLAGS
|
|
|
|
#include <cstdio>
|
|
|
|
// Fallback entry point for builds without gflags: the benchmark cannot parse
// its options, so it only reports the missing dependency and fails.
int main() {
  std::fputs("Please install gflags to run rocksdb tools\n", stderr);
  return 1;
}
|
|
|
|
#else
|
|
|
|
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "file/random_access_file_reader.h"
|
|
|
|
#include "monitoring/histogram.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/file_system.h"
|
|
|
|
#include "rocksdb/slice_transform.h"
|
|
|
|
#include "rocksdb/system_clock.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
#include "table/get_context.h"
|
|
|
|
#include "table/internal_iterator.h"
|
|
|
|
#include "table/plain/plain_table_factory.h"
|
|
|
|
#include "table/table_builder.h"
|
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "test_util/testutil.h"
|
|
|
|
#include "util/gflags_compat.h"
|
|
|
|
|
|
|
|
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
|
|
|
|
using GFLAGS_NAMESPACE::SetUsageMessage;
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
// Make a key that i determines the first 4 characters and j determines the
|
|
|
|
// last 4 characters.
|
|
|
|
static std::string MakeKey(int i, int j, bool through_db) {
|
|
|
|
char buf[100];
|
|
|
|
snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j);
|
|
|
|
if (through_db) {
|
|
|
|
return std::string(buf);
|
|
|
|
}
|
|
|
|
// If we directly query table, which operates on internal keys
|
|
|
|
// instead of user keys, we need to add 8 bytes of internal
|
|
|
|
// information (row type etc) to user key to make an internal
|
|
|
|
// key.
|
|
|
|
InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
|
|
|
|
return key.Encode().ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reads the current time from `clock` in the unit selected by the caller:
// nanoseconds when measured_by_nanosecond is true, microseconds otherwise.
uint64_t Now(SystemClock* clock, bool measured_by_nanosecond) {
  if (measured_by_nanosecond) {
    return clock->NowNanos();
  }
  return clock->NowMicros();
}
|
|
|
|
} // namespace
|
Benchmark table reader wiht nanoseconds
Summary: nanosecnods gave us better view of the performance, especially when some operations are fast so that micro seconds may only reveal less informative results.
Test Plan:
sample output:
./table_reader_bench --plain_table --time_unit=nanosecond
=======================================================================================================
InMemoryTableSimpleBenchmark: PlainTable num_key1: 4096 num_key2: 512 non_empty
=======================================================================================================
Histogram (unit: nanosecond):
Count: 6291456 Average: 475.3867 StdDev: 556.05
Min: 135.0000 Median: 400.1817 Max: 33370.0000
Percentiles: P50: 400.18 P75: 530.02 P99: 887.73 P99.9: 8843.26 P99.99: 9941.21
------------------------------------------------------
[ 120, 140 ) 2 0.000% 0.000%
[ 140, 160 ) 452 0.007% 0.007%
[ 160, 180 ) 13683 0.217% 0.225%
[ 180, 200 ) 54353 0.864% 1.089%
[ 200, 250 ) 101004 1.605% 2.694%
[ 250, 300 ) 729791 11.600% 14.294% ##
[ 300, 350 ) 616070 9.792% 24.086% ##
[ 350, 400 ) 1628021 25.877% 49.963% #####
[ 400, 450 ) 647220 10.287% 60.250% ##
[ 450, 500 ) 577206 9.174% 69.424% ##
[ 500, 600 ) 1168585 18.574% 87.999% ####
[ 600, 700 ) 506875 8.057% 96.055% ##
[ 700, 800 ) 147878 2.350% 98.406%
[ 800, 900 ) 42633 0.678% 99.083%
[ 900, 1000 ) 16304 0.259% 99.342%
[ 1000, 1200 ) 7811 0.124% 99.466%
[ 1200, 1400 ) 1453 0.023% 99.490%
[ 1400, 1600 ) 307 0.005% 99.494%
[ 1600, 1800 ) 81 0.001% 99.496%
[ 1800, 2000 ) 18 0.000% 99.496%
[ 2000, 2500 ) 8 0.000% 99.496%
[ 2500, 3000 ) 6 0.000% 99.496%
[ 3500, 4000 ) 3 0.000% 99.496%
[ 4000, 4500 ) 116 0.002% 99.498%
[ 4500, 5000 ) 1144 0.018% 99.516%
[ 5000, 6000 ) 1087 0.017% 99.534%
[ 6000, 7000 ) 2403 0.038% 99.572%
[ 7000, 8000 ) 9840 0.156% 99.728%
[ 8000, 9000 ) 12820 0.204% 99.932%
[ 9000, 10000 ) 3881 0.062% 99.994%
[ 10000, 12000 ) 135 0.002% 99.996%
[ 12000, 14000 ) 159 0.003% 99.998%
[ 14000, 16000 ) 58 0.001% 99.999%
[ 16000, 18000 ) 30 0.000% 100.000%
[ 18000, 20000 ) 14 0.000% 100.000%
[ 20000, 25000 ) 2 0.000% 100.000%
[ 25000, 30000 ) 2 0.000% 100.000%
[ 30000, 35000 ) 1 0.000% 100.000%
Reviewers: haobo, dhruba, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D16113
11 years ago
|
|
|
|
|
|
|
// A very simple benchmark that does the following:
|
|
|
|
// Create a table with roughly numKey1 * numKey2 keys,
|
|
|
|
// where there are numKey1 prefixes of the key, each has numKey2 number of
|
|
|
|
// distinguished key, differing in the suffix part.
|
|
|
|
// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2
|
|
|
|
// times randomly.
|
|
|
|
// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys.
|
|
|
|
// Print out a histogram of the measured query times.
|
|
|
|
// If through_db=true, a full DB will be created and queries will be against
|
|
|
|
// it. Otherwise, operations will be directly through table level.
|
|
|
|
//
|
|
|
|
// If for_iterator=true, instead of just querying one key each time, it queries
|
|
|
|
// a range sharing the same prefix.
|
|
|
|
namespace {
|
|
|
|
void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
|
|
|
|
ReadOptions& read_options, int num_keys1,
|
|
|
|
int num_keys2, int num_iter, int /*prefix_len*/,
|
|
|
|
bool if_query_empty_keys, bool for_iterator,
|
Benchmark table reader wiht nanoseconds
Summary: nanosecnods gave us better view of the performance, especially when some operations are fast so that micro seconds may only reveal less informative results.
Test Plan:
sample output:
./table_reader_bench --plain_table --time_unit=nanosecond
=======================================================================================================
InMemoryTableSimpleBenchmark: PlainTable num_key1: 4096 num_key2: 512 non_empty
=======================================================================================================
Histogram (unit: nanosecond):
Count: 6291456 Average: 475.3867 StdDev: 556.05
Min: 135.0000 Median: 400.1817 Max: 33370.0000
Percentiles: P50: 400.18 P75: 530.02 P99: 887.73 P99.9: 8843.26 P99.99: 9941.21
------------------------------------------------------
[ 120, 140 ) 2 0.000% 0.000%
[ 140, 160 ) 452 0.007% 0.007%
[ 160, 180 ) 13683 0.217% 0.225%
[ 180, 200 ) 54353 0.864% 1.089%
[ 200, 250 ) 101004 1.605% 2.694%
[ 250, 300 ) 729791 11.600% 14.294% ##
[ 300, 350 ) 616070 9.792% 24.086% ##
[ 350, 400 ) 1628021 25.877% 49.963% #####
[ 400, 450 ) 647220 10.287% 60.250% ##
[ 450, 500 ) 577206 9.174% 69.424% ##
[ 500, 600 ) 1168585 18.574% 87.999% ####
[ 600, 700 ) 506875 8.057% 96.055% ##
[ 700, 800 ) 147878 2.350% 98.406%
[ 800, 900 ) 42633 0.678% 99.083%
[ 900, 1000 ) 16304 0.259% 99.342%
[ 1000, 1200 ) 7811 0.124% 99.466%
[ 1200, 1400 ) 1453 0.023% 99.490%
[ 1400, 1600 ) 307 0.005% 99.494%
[ 1600, 1800 ) 81 0.001% 99.496%
[ 1800, 2000 ) 18 0.000% 99.496%
[ 2000, 2500 ) 8 0.000% 99.496%
[ 2500, 3000 ) 6 0.000% 99.496%
[ 3500, 4000 ) 3 0.000% 99.496%
[ 4000, 4500 ) 116 0.002% 99.498%
[ 4500, 5000 ) 1144 0.018% 99.516%
[ 5000, 6000 ) 1087 0.017% 99.534%
[ 6000, 7000 ) 2403 0.038% 99.572%
[ 7000, 8000 ) 9840 0.156% 99.728%
[ 8000, 9000 ) 12820 0.204% 99.932%
[ 9000, 10000 ) 3881 0.062% 99.994%
[ 10000, 12000 ) 135 0.002% 99.996%
[ 12000, 14000 ) 159 0.003% 99.998%
[ 14000, 16000 ) 58 0.001% 99.999%
[ 16000, 18000 ) 30 0.000% 100.000%
[ 18000, 20000 ) 14 0.000% 100.000%
[ 20000, 25000 ) 2 0.000% 100.000%
[ 25000, 30000 ) 2 0.000% 100.000%
[ 30000, 35000 ) 1 0.000% 100.000%
Reviewers: haobo, dhruba, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D16113
11 years ago
|
|
|
bool through_db, bool measured_by_nanosecond) {
|
|
|
|
ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
|
|
|
|
|
|
|
|
std::string file_name =
|
|
|
|
test::PerThreadDBPath("rocksdb_table_reader_benchmark");
|
|
|
|
std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db");
|
|
|
|
WriteOptions wo;
|
|
|
|
Env* env = Env::Default();
|
|
|
|
auto* clock = env->GetSystemClock().get();
|
|
|
|
TableBuilder* tb = nullptr;
|
|
|
|
DB* db = nullptr;
|
|
|
|
Status s;
|
|
|
|
const ImmutableOptions ioptions(opts);
|
|
|
|
const ColumnFamilyOptions cfo(opts);
|
|
|
|
const MutableCFOptions moptions(cfo);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer;
|
|
|
|
if (!through_db) {
|
|
|
|
ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), file_name,
|
|
|
|
FileOptions(env_options), &file_writer,
|
|
|
|
nullptr));
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
|
|
|
|
IntTblPropCollectorFactories int_tbl_prop_collector_factories;
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
|
|
|
|
int unknown_level = -1;
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
tb = opts.table_factory->NewTableBuilder(
|
|
|
|
TableBuilderOptions(
|
|
|
|
ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
|
|
|
|
CompressionType::kNoCompression, CompressionOptions(),
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
4 years ago
|
|
|
0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level),
|
|
|
|
file_writer.get());
|
|
|
|
} else {
|
|
|
|
s = DB::Open(opts, dbname, &db);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db != nullptr);
|
|
|
|
}
|
|
|
|
// Populate slightly more than 1M keys
|
|
|
|
for (int i = 0; i < num_keys1; i++) {
|
|
|
|
for (int j = 0; j < num_keys2; j++) {
|
|
|
|
std::string key = MakeKey(i * 2, j, through_db);
|
|
|
|
if (!through_db) {
|
|
|
|
tb->Add(key, key);
|
|
|
|
} else {
|
|
|
|
db->Put(wo, key, key);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!through_db) {
|
|
|
|
tb->Finish();
|
|
|
|
file_writer->Close();
|
|
|
|
} else {
|
|
|
|
db->Flush(FlushOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<TableReader> table_reader;
|
|
|
|
if (!through_db) {
|
|
|
|
const auto& fs = env->GetFileSystem();
|
|
|
|
FileOptions fopts(env_options);
|
|
|
|
|
|
|
|
std::unique_ptr<FSRandomAccessFile> raf;
|
|
|
|
s = fs->NewRandomAccessFile(file_name, fopts, &raf, nullptr);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str());
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
uint64_t file_size;
|
|
|
|
fs->GetFileSize(file_name, fopts.io_options, &file_size, nullptr);
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
|
|
|
new RandomAccessFileReader(std::move(raf), file_name));
|
|
|
|
s = opts.table_factory->NewTableReader(
|
|
|
|
TableReaderOptions(ioptions, moptions.prefix_extractor, env_options,
|
|
|
|
ikc),
|
|
|
|
std::move(file_reader), file_size, &table_reader);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::string result;
|
|
|
|
HistogramImpl hist;
|
|
|
|
|
|
|
|
for (int it = 0; it < num_iter; it++) {
|
|
|
|
for (int i = 0; i < num_keys1; i++) {
|
|
|
|
for (int j = 0; j < num_keys2; j++) {
|
|
|
|
int r1 = rnd.Uniform(num_keys1) * 2;
|
|
|
|
int r2 = rnd.Uniform(num_keys2);
|
|
|
|
if (if_query_empty_keys) {
|
|
|
|
r1++;
|
|
|
|
r2 = num_keys2 * 2 - r2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!for_iterator) {
|
|
|
|
// Query one existing key;
|
|
|
|
std::string key = MakeKey(r1, r2, through_db);
|
|
|
|
uint64_t start_time = Now(clock, measured_by_nanosecond);
|
|
|
|
if (!through_db) {
|
|
|
|
PinnableSlice value;
|
|
|
|
MergeContext merge_context;
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
6 years ago
|
|
|
SequenceNumber max_covering_tombstone_seq = 0;
|
|
|
|
GetContext get_context(
|
|
|
|
ioptions.user_comparator, ioptions.merge_operator.get(),
|
|
|
|
ioptions.logger, ioptions.stats, GetContext::kNotFound,
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2 years ago
|
|
|
Slice(key), &value, /*columns=*/nullptr, /*timestamp=*/nullptr,
|
|
|
|
&merge_context, true, &max_covering_tombstone_seq, clock);
|
|
|
|
s = table_reader->Get(read_options, key, &get_context, nullptr);
|
|
|
|
} else {
|
|
|
|
s = db->Get(read_options, key, &result);
|
|
|
|
}
|
|
|
|
hist.Add(Now(clock, measured_by_nanosecond) - start_time);
|
|
|
|
} else {
|
|
|
|
int r2_len;
|
|
|
|
if (if_query_empty_keys) {
|
|
|
|
r2_len = 0;
|
|
|
|
} else {
|
|
|
|
r2_len = rnd.Uniform(num_keys2) + 1;
|
|
|
|
if (r2_len + r2 > num_keys2) {
|
|
|
|
r2_len = num_keys2 - r2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
std::string start_key = MakeKey(r1, r2, through_db);
|
|
|
|
std::string end_key = MakeKey(r1, r2 + r2_len, through_db);
|
|
|
|
uint64_t total_time = 0;
|
|
|
|
uint64_t start_time = Now(clock, measured_by_nanosecond);
|
|
|
|
Iterator* iter = nullptr;
|
|
|
|
InternalIterator* iiter = nullptr;
|
|
|
|
if (!through_db) {
|
|
|
|
iiter = table_reader->NewIterator(
|
|
|
|
read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized);
|
|
|
|
} else {
|
|
|
|
iter = db->NewIterator(read_options);
|
|
|
|
}
|
|
|
|
int count = 0;
|
|
|
|
for (through_db ? iter->Seek(start_key) : iiter->Seek(start_key);
|
|
|
|
through_db ? iter->Valid() : iiter->Valid();
|
|
|
|
through_db ? iter->Next() : iiter->Next()) {
|
|
|
|
if (if_query_empty_keys) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// verify key;
|
|
|
|
total_time += Now(clock, measured_by_nanosecond) - start_time;
|
plain table reader: non-mmap mode to keep two recent buffers
Summary: In plain table reader's non-mmap mode, we only keep the most recent read buffer. However, for binary search, it is likely we come back to a location to read. To avoid one pread in such a case, we keep two read buffers. It should cover most of the cases.
Test Plan:
1. run tests
2. check the optimization works through strace when running
./table_reader_bench -mmap_read=false --num_keys2=1 -num_keys1=5000 -table_factory=plain_table --iterator --through_db
Reviewers: anthony, rven, kradhakrishnan, igor, yhchiang, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D51171
9 years ago
|
|
|
assert(Slice(MakeKey(r1, r2 + count, through_db)) ==
|
|
|
|
(through_db ? iter->key() : iiter->key()));
|
|
|
|
start_time = Now(clock, measured_by_nanosecond);
|
|
|
|
if (++count >= r2_len) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (count != r2_len) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Iterator cannot iterate expected number of entries. "
|
|
|
|
"Expected %d but got %d\n",
|
|
|
|
r2_len, count);
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
total_time += Now(clock, measured_by_nanosecond) - start_time;
|
|
|
|
hist.Add(total_time);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"==================================================="
|
|
|
|
"====================================================\n"
|
|
|
|
"InMemoryTableSimpleBenchmark: %20s num_key1: %5d "
|
|
|
|
"num_key2: %5d %10s\n"
|
|
|
|
"==================================================="
|
|
|
|
"===================================================="
|
Benchmark table reader wiht nanoseconds
Summary: nanosecnods gave us better view of the performance, especially when some operations are fast so that micro seconds may only reveal less informative results.
Test Plan:
sample output:
./table_reader_bench --plain_table --time_unit=nanosecond
=======================================================================================================
InMemoryTableSimpleBenchmark: PlainTable num_key1: 4096 num_key2: 512 non_empty
=======================================================================================================
Histogram (unit: nanosecond):
Count: 6291456 Average: 475.3867 StdDev: 556.05
Min: 135.0000 Median: 400.1817 Max: 33370.0000
Percentiles: P50: 400.18 P75: 530.02 P99: 887.73 P99.9: 8843.26 P99.99: 9941.21
------------------------------------------------------
[ 120, 140 ) 2 0.000% 0.000%
[ 140, 160 ) 452 0.007% 0.007%
[ 160, 180 ) 13683 0.217% 0.225%
[ 180, 200 ) 54353 0.864% 1.089%
[ 200, 250 ) 101004 1.605% 2.694%
[ 250, 300 ) 729791 11.600% 14.294% ##
[ 300, 350 ) 616070 9.792% 24.086% ##
[ 350, 400 ) 1628021 25.877% 49.963% #####
[ 400, 450 ) 647220 10.287% 60.250% ##
[ 450, 500 ) 577206 9.174% 69.424% ##
[ 500, 600 ) 1168585 18.574% 87.999% ####
[ 600, 700 ) 506875 8.057% 96.055% ##
[ 700, 800 ) 147878 2.350% 98.406%
[ 800, 900 ) 42633 0.678% 99.083%
[ 900, 1000 ) 16304 0.259% 99.342%
[ 1000, 1200 ) 7811 0.124% 99.466%
[ 1200, 1400 ) 1453 0.023% 99.490%
[ 1400, 1600 ) 307 0.005% 99.494%
[ 1600, 1800 ) 81 0.001% 99.496%
[ 1800, 2000 ) 18 0.000% 99.496%
[ 2000, 2500 ) 8 0.000% 99.496%
[ 2500, 3000 ) 6 0.000% 99.496%
[ 3500, 4000 ) 3 0.000% 99.496%
[ 4000, 4500 ) 116 0.002% 99.498%
[ 4500, 5000 ) 1144 0.018% 99.516%
[ 5000, 6000 ) 1087 0.017% 99.534%
[ 6000, 7000 ) 2403 0.038% 99.572%
[ 7000, 8000 ) 9840 0.156% 99.728%
[ 8000, 9000 ) 12820 0.204% 99.932%
[ 9000, 10000 ) 3881 0.062% 99.994%
[ 10000, 12000 ) 135 0.002% 99.996%
[ 12000, 14000 ) 159 0.003% 99.998%
[ 14000, 16000 ) 58 0.001% 99.999%
[ 16000, 18000 ) 30 0.000% 100.000%
[ 18000, 20000 ) 14 0.000% 100.000%
[ 20000, 25000 ) 2 0.000% 100.000%
[ 25000, 30000 ) 2 0.000% 100.000%
[ 30000, 35000 ) 1 0.000% 100.000%
Reviewers: haobo, dhruba, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D16113
11 years ago
|
|
|
"\nHistogram (unit: %s): \n%s",
|
|
|
|
opts.table_factory->Name(), num_keys1, num_keys2,
|
Benchmark table reader wiht nanoseconds
Summary: nanosecnods gave us better view of the performance, especially when some operations are fast so that micro seconds may only reveal less informative results.
Test Plan:
sample output:
./table_reader_bench --plain_table --time_unit=nanosecond
=======================================================================================================
InMemoryTableSimpleBenchmark: PlainTable num_key1: 4096 num_key2: 512 non_empty
=======================================================================================================
Histogram (unit: nanosecond):
Count: 6291456 Average: 475.3867 StdDev: 556.05
Min: 135.0000 Median: 400.1817 Max: 33370.0000
Percentiles: P50: 400.18 P75: 530.02 P99: 887.73 P99.9: 8843.26 P99.99: 9941.21
------------------------------------------------------
[ 120, 140 ) 2 0.000% 0.000%
[ 140, 160 ) 452 0.007% 0.007%
[ 160, 180 ) 13683 0.217% 0.225%
[ 180, 200 ) 54353 0.864% 1.089%
[ 200, 250 ) 101004 1.605% 2.694%
[ 250, 300 ) 729791 11.600% 14.294% ##
[ 300, 350 ) 616070 9.792% 24.086% ##
[ 350, 400 ) 1628021 25.877% 49.963% #####
[ 400, 450 ) 647220 10.287% 60.250% ##
[ 450, 500 ) 577206 9.174% 69.424% ##
[ 500, 600 ) 1168585 18.574% 87.999% ####
[ 600, 700 ) 506875 8.057% 96.055% ##
[ 700, 800 ) 147878 2.350% 98.406%
[ 800, 900 ) 42633 0.678% 99.083%
[ 900, 1000 ) 16304 0.259% 99.342%
[ 1000, 1200 ) 7811 0.124% 99.466%
[ 1200, 1400 ) 1453 0.023% 99.490%
[ 1400, 1600 ) 307 0.005% 99.494%
[ 1600, 1800 ) 81 0.001% 99.496%
[ 1800, 2000 ) 18 0.000% 99.496%
[ 2000, 2500 ) 8 0.000% 99.496%
[ 2500, 3000 ) 6 0.000% 99.496%
[ 3500, 4000 ) 3 0.000% 99.496%
[ 4000, 4500 ) 116 0.002% 99.498%
[ 4500, 5000 ) 1144 0.018% 99.516%
[ 5000, 6000 ) 1087 0.017% 99.534%
[ 6000, 7000 ) 2403 0.038% 99.572%
[ 7000, 8000 ) 9840 0.156% 99.728%
[ 8000, 9000 ) 12820 0.204% 99.932%
[ 9000, 10000 ) 3881 0.062% 99.994%
[ 10000, 12000 ) 135 0.002% 99.996%
[ 12000, 14000 ) 159 0.003% 99.998%
[ 14000, 16000 ) 58 0.001% 99.999%
[ 16000, 18000 ) 30 0.000% 100.000%
[ 18000, 20000 ) 14 0.000% 100.000%
[ 20000, 25000 ) 2 0.000% 100.000%
[ 25000, 30000 ) 2 0.000% 100.000%
[ 30000, 35000 ) 1 0.000% 100.000%
Reviewers: haobo, dhruba, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D16113
11 years ago
|
|
|
for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"),
|
|
|
|
measured_by_nanosecond ? "nanosecond" : "microsecond",
|
|
|
|
hist.ToString().c_str());
|
|
|
|
if (!through_db) {
|
|
|
|
env->DeleteFile(file_name);
|
|
|
|
} else {
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
DestroyDB(dbname, opts);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
// Command-line flags of the benchmark.
DEFINE_bool(query_empty, false,
            "query non-existing keys instead of existing ones.");
DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys");
DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix");
// The help text here used to be a copy of the query_empty description.
DEFINE_int32(iter, 3, "number of benchmark iterations");
DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes");
DEFINE_bool(iterator, false, "For test iterator");
DEFINE_bool(through_db, false,
            "If enable, a DB instance will be created and the query will be "
            "against DB. Otherwise, will be directly against a table reader.");
DEFINE_bool(mmap_read, true, "Whether use mmap read");
DEFINE_string(table_factory, "block_based",
              "Table factory to use: `block_based` (default), `plain_table` or "
              "`cuckoo_hash`.");
DEFINE_string(time_unit, "microsecond",
              "The time unit used for measuring performance. User can specify "
              "`microsecond` (default) or `nanosecond`");
|
|
|
|
|
|
|
|
// Entry point of the table reader benchmark tool.
//
// Parses the command-line flags, fills in Options / ReadOptions / EnvOptions,
// selects a TableFactory according to --table_factory ("cuckoo_hash",
// "plain_table" or "block_based") and then calls TableReaderBenchmark() with
// the remaining flags.
//
// Returns 0 after the benchmark call, or 1 when --table_factory names an
// unknown table type (in which case no benchmark is run).
int main(int argc, char** argv) {
|
|
|
|
// Usage banner is built from the program name.
SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
|
|
|
|
" [OPTIONS]...");
|
|
|
|
ParseCommandLineFlags(&argc, &argv, true);
|
|
|
|
|
|
|
|
// Stays empty unless one of the known table types is selected below.
std::shared_ptr<ROCKSDB_NAMESPACE::TableFactory> tf;
|
|
|
|
ROCKSDB_NAMESPACE::Options options;
|
|
|
|
// Install a fixed-length prefix extractor only for prefix lengths below 16
// (16 is also the user_key_len configured for the plain table further down).
if (FLAGS_prefix_len < 16) {
|
|
|
|
options.prefix_extractor.reset(
|
|
|
|
ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len));
|
|
|
|
}
|
|
|
|
ROCKSDB_NAMESPACE::ReadOptions ro;
|
|
|
|
ROCKSDB_NAMESPACE::EnvOptions env_options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
// Compression is turned off for the benchmark.
options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
|
|
|
|
|
|
|
|
// Choose the table factory by name.
if (FLAGS_table_factory == "cuckoo_hash") {
|
|
|
|
// --mmap_read is applied to both the DB options and the env options.
options.allow_mmap_reads = FLAGS_mmap_read;
|
|
|
|
env_options.use_mmap_reads = FLAGS_mmap_read;
|
|
|
|
ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
|
CuckooTable: add one option to allow identity function for the first hash function
Summary:
MurmurHash becomes expensive when we do millions Get() a second in one
thread. Add this option to allow the first hash function to use identity
function as hash function. It results in QPS increase from 3.7M/s to
~4.3M/s. I did not observe improvement for end to end RocksDB
performance. This may be caused by other bottlenecks that I will address
in a separate diff.
Test Plan:
```
[ljin@dev1964 rocksdb] ./cuckoo_table_reader_test --enable_perf --file_dir=/dev/shm --write --identity_as_first_hash=0
==== Test CuckooReaderTest.WhenKeyExists
==== Test CuckooReaderTest.WhenKeyExistsWithUint64Comparator
==== Test CuckooReaderTest.CheckIterator
==== Test CuckooReaderTest.CheckIteratorUint64
==== Test CuckooReaderTest.WhenKeyNotFound
==== Test CuckooReaderTest.TestReadPerformance
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.272us (3.7 Mqps) with batch size of 0, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.138us (7.2 Mqps) with batch size of 10, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.142us (7.1 Mqps) with batch size of 25, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.142us (7.0 Mqps) with batch size of 50, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.144us (6.9 Mqps) with batch size of 100, # of found keys 125829120
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.201us (5.0 Mqps) with batch size of 0, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.121us (8.3 Mqps) with batch size of 10, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.123us (8.1 Mqps) with batch size of 25, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.121us (8.3 Mqps) with batch size of 50, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.112us (8.9 Mqps) with batch size of 100, # of found keys 104857600
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.251us (4.0 Mqps) with batch size of 0, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.107us (9.4 Mqps) with batch size of 10, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.099us (10.1 Mqps) with batch size of 25, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.100us (10.0 Mqps) with batch size of 50, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.116us (8.6 Mqps) with batch size of 100, # of found keys 83886080
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.189us (5.3 Mqps) with batch size of 0, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.095us (10.5 Mqps) with batch size of 10, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.096us (10.4 Mqps) with batch size of 25, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.098us (10.2 Mqps) with batch size of 50, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.105us (9.5 Mqps) with batch size of 100, # of found keys 73400320
[ljin@dev1964 rocksdb] ./cuckoo_table_reader_test --enable_perf --file_dir=/dev/shm --write --identity_as_first_hash=1
==== Test CuckooReaderTest.WhenKeyExists
==== Test CuckooReaderTest.WhenKeyExistsWithUint64Comparator
==== Test CuckooReaderTest.CheckIterator
==== Test CuckooReaderTest.CheckIteratorUint64
==== Test CuckooReaderTest.WhenKeyNotFound
==== Test CuckooReaderTest.TestReadPerformance
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.230us (4.3 Mqps) with batch size of 0, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.086us (11.7 Mqps) with batch size of 10, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.088us (11.3 Mqps) with batch size of 25, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.083us (12.1 Mqps) with batch size of 50, # of found keys 125829120
With 125829120 items, utilization is 93.75%, number of hash functions: 2.
Time taken per op is 0.083us (12.1 Mqps) with batch size of 100, # of found keys 125829120
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.159us (6.3 Mqps) with batch size of 0, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.078us (12.8 Mqps) with batch size of 10, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.080us (12.6 Mqps) with batch size of 25, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.080us (12.5 Mqps) with batch size of 50, # of found keys 104857600
With 104857600 items, utilization is 78.12%, number of hash functions: 2.
Time taken per op is 0.082us (12.2 Mqps) with batch size of 100, # of found keys 104857600
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.154us (6.5 Mqps) with batch size of 0, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.077us (13.0 Mqps) with batch size of 10, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.077us (12.9 Mqps) with batch size of 25, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.078us (12.8 Mqps) with batch size of 50, # of found keys 83886080
With 83886080 items, utilization is 62.50%, number of hash functions: 2.
Time taken per op is 0.079us (12.6 Mqps) with batch size of 100, # of found keys 83886080
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.218us (4.6 Mqps) with batch size of 0, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.083us (12.0 Mqps) with batch size of 10, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.085us (11.7 Mqps) with batch size of 25, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.086us (11.6 Mqps) with batch size of 50, # of found keys 73400320
With 73400320 items, utilization is 54.69%, number of hash functions: 2.
Time taken per op is 0.078us (12.8 Mqps) with batch size of 100, # of found keys 73400320
```
Reviewers: sdong, igor, yhchiang
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23451
11 years ago
|
|
|
table_options.hash_table_ratio = 0.75;
|
|
|
|
tf.reset(ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options));
|
|
|
|
} else if (FLAGS_table_factory == "plain_table") {
|
|
|
|
// Same mmap handling as the cuckoo branch.
options.allow_mmap_reads = FLAGS_mmap_read;
|
|
|
|
env_options.use_mmap_reads = FLAGS_mmap_read;
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::PlainTableOptions plain_table_options;
|
|
|
|
plain_table_options.user_key_len = 16;
|
|
|
|
// 0 bloom bits per key when the prefix covers the whole 16-byte key,
// 8 otherwise (presumably 0 turns the bloom filter off; confirm against
// the PlainTableOptions documentation).
plain_table_options.bloom_bits_per_key = (FLAGS_prefix_len == 16) ? 0 : 8;
|
|
|
|
plain_table_options.hash_table_ratio = 0.75;
|
|
|
|
|
|
|
|
tf.reset(new ROCKSDB_NAMESPACE::PlainTableFactory(plain_table_options));
|
|
|
|
// Unlike the guarded setup near the top of main, this branch installs the
// prefix extractor unconditionally, replacing whatever was set there.
options.prefix_extractor.reset(
|
|
|
|
ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len));
|
|
|
|
} else if (FLAGS_table_factory == "block_based") {
|
|
|
|
// Default-constructed factory; the mmap settings are not touched here.
tf.reset(new ROCKSDB_NAMESPACE::BlockBasedTableFactory());
|
|
|
|
} else {
|
|
|
|
// Unknown table type: report it and leave tf empty so that the check
// below returns 1 without running the benchmark.
fprintf(stderr, "Invalid table type %s\n", FLAGS_table_factory.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tf) {
|
|
|
|
// Any --time_unit value other than "nanosecond" (including invalid ones)
// falls back to microsecond.
|
|
|
|
bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond";
|
|
|
|
|
|
|
|
options.table_factory = tf;
|
|
|
|
// The benchmark's return value, if any, is not inspected here.
ROCKSDB_NAMESPACE::TableReaderBenchmark(
|
|
|
|
options, env_options, ro, FLAGS_num_keys1, FLAGS_num_keys2, FLAGS_iter,
|
|
|
|
FLAGS_prefix_len, FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db,
|
|
|
|
measured_by_nanosecond);
|
|
|
|
} else {
|
|
|
|
// No usable table factory was selected.
return 1;
|
|
|
|
}
|
Benchmark table reader wiht nanoseconds
Summary: nanosecnods gave us better view of the performance, especially when some operations are fast so that micro seconds may only reveal less informative results.
Test Plan:
sample output:
./table_reader_bench --plain_table --time_unit=nanosecond
=======================================================================================================
InMemoryTableSimpleBenchmark: PlainTable num_key1: 4096 num_key2: 512 non_empty
=======================================================================================================
Histogram (unit: nanosecond):
Count: 6291456 Average: 475.3867 StdDev: 556.05
Min: 135.0000 Median: 400.1817 Max: 33370.0000
Percentiles: P50: 400.18 P75: 530.02 P99: 887.73 P99.9: 8843.26 P99.99: 9941.21
------------------------------------------------------
[ 120, 140 ) 2 0.000% 0.000%
[ 140, 160 ) 452 0.007% 0.007%
[ 160, 180 ) 13683 0.217% 0.225%
[ 180, 200 ) 54353 0.864% 1.089%
[ 200, 250 ) 101004 1.605% 2.694%
[ 250, 300 ) 729791 11.600% 14.294% ##
[ 300, 350 ) 616070 9.792% 24.086% ##
[ 350, 400 ) 1628021 25.877% 49.963% #####
[ 400, 450 ) 647220 10.287% 60.250% ##
[ 450, 500 ) 577206 9.174% 69.424% ##
[ 500, 600 ) 1168585 18.574% 87.999% ####
[ 600, 700 ) 506875 8.057% 96.055% ##
[ 700, 800 ) 147878 2.350% 98.406%
[ 800, 900 ) 42633 0.678% 99.083%
[ 900, 1000 ) 16304 0.259% 99.342%
[ 1000, 1200 ) 7811 0.124% 99.466%
[ 1200, 1400 ) 1453 0.023% 99.490%
[ 1400, 1600 ) 307 0.005% 99.494%
[ 1600, 1800 ) 81 0.001% 99.496%
[ 1800, 2000 ) 18 0.000% 99.496%
[ 2000, 2500 ) 8 0.000% 99.496%
[ 2500, 3000 ) 6 0.000% 99.496%
[ 3500, 4000 ) 3 0.000% 99.496%
[ 4000, 4500 ) 116 0.002% 99.498%
[ 4500, 5000 ) 1144 0.018% 99.516%
[ 5000, 6000 ) 1087 0.017% 99.534%
[ 6000, 7000 ) 2403 0.038% 99.572%
[ 7000, 8000 ) 9840 0.156% 99.728%
[ 8000, 9000 ) 12820 0.204% 99.932%
[ 9000, 10000 ) 3881 0.062% 99.994%
[ 10000, 12000 ) 135 0.002% 99.996%
[ 12000, 14000 ) 159 0.003% 99.998%
[ 14000, 16000 ) 58 0.001% 99.999%
[ 16000, 18000 ) 30 0.000% 100.000%
[ 18000, 20000 ) 14 0.000% 100.000%
[ 20000, 25000 ) 2 0.000% 100.000%
[ 25000, 30000 ) 2 0.000% 100.000%
[ 30000, 35000 ) 1 0.000% 100.000%
Reviewers: haobo, dhruba, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D16113
11 years ago
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // GFLAGS
|