//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_test_util.h"

#include "cache/cache_reservation_manager.h"
#include "db/forward_iterator.h"
#include "env/mock_env.h"
#include "port/lang.h"
#include "rocksdb/cache.h"
#include "rocksdb/convenience.h"
#include "rocksdb/env_encryption.h"
#include "rocksdb/unique_id.h"
#include "rocksdb/utilities/object_registry.h"
#include "table/format.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

namespace {
int64_t MaybeCurrentTime(Env* env) {
  int64_t time = 1337346000;  // arbitrary fallback default
  env->GetCurrentTime(&time).PermitUncheckedError();
  return time;
}
}  // namespace

// Special Env used to delay background operations
SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep)
    : EnvWrapper(base),
      maybe_starting_time_(MaybeCurrentTime(base)),
      rnd_(301),
      sleep_counter_(this),
      time_elapse_only_sleep_(time_elapse_only_sleep),
      no_slowdown_(time_elapse_only_sleep) {
  delay_sstable_sync_.store(false, std::memory_order_release);
  drop_writes_.store(false, std::memory_order_release);
  no_space_.store(false, std::memory_order_release);
  non_writable_.store(false, std::memory_order_release);
  count_random_reads_ = false;
  count_sequential_reads_ = false;
  manifest_sync_error_.store(false, std::memory_order_release);
  manifest_write_error_.store(false, std::memory_order_release);
  log_write_error_.store(false, std::memory_order_release);
  no_file_overwrite_.store(false, std::memory_order_release);
  random_file_open_counter_.store(0, std::memory_order_relaxed);
  delete_count_.store(0, std::memory_order_relaxed);
  num_open_wal_file_.store(0);
  log_write_slowdown_ = 0;
  bytes_written_ = 0;
  sync_counter_ = 0;
  non_writeable_rate_ = 0;
  new_writable_count_ = 0;
  non_writable_count_ = 0;
  table_write_callback_ = nullptr;
}
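
// Build the per-test environment: optionally wrap the default Env in a mock
// in-memory Env (MEM_ENV) and/or an encrypted Env (ENCRYPTED_ENV), wrap the
// result in SpecialEnv, destroy any leftover database, and open a fresh one.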
DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
    : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
  Env* base_env = Env::Default();
  ConfigOptions config_options;
  EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_));
  EXPECT_NE(nullptr, base_env);
  if (getenv("MEM_ENV")) {
    mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock());
  }
#ifndef ROCKSDB_LITE
  if (getenv("ENCRYPTED_ENV")) {
    std::shared_ptr<EncryptionProvider> provider;
    std::string provider_id = getenv("ENCRYPTED_ENV");
    if (provider_id.find("=") == std::string::npos &&
        !EndsWith(provider_id, "://test")) {
      provider_id = provider_id + "://test";
    }
    EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id,
                                                   &provider));
    encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider);
  }
#endif  // !ROCKSDB_LITE
  env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
                                       : (mem_env_ ? mem_env_ : base_env));
  env_->SetBackgroundThreads(1, Env::LOW);
  env_->SetBackgroundThreads(1, Env::HIGH);
  env_->skip_fsync_ = !env_do_fsync;
  dbname_ = test::PerThreadDBPath(env_, path);
  alternative_wal_dir_ = dbname_ + "/wal";
  alternative_db_log_dir_ = dbname_ + "/db_log_dir";
  auto options = CurrentOptions();
  options.env = env_;
  auto delete_options = options;
  delete_options.wal_dir = alternative_wal_dir_;
  EXPECT_OK(DestroyDB(dbname_, delete_options));
  // Destroy the DB again with plain options, in case the alternative WAL dir
  // was not in use.
  EXPECT_OK(DestroyDB(dbname_, options));
  db_ = nullptr;
  Reopen(options);
  Random::GetTLSInstance()->Reset(0xdeadbeef);
}
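
// Tear down: disable sync points, close the DB, and destroy its files (across
// all db_paths) unless the KEEP_DB environment variable is set.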
DBTestBase::~DBTestBase() {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  Close();
  Options options;
  options.db_paths.emplace_back(dbname_, 0);
  options.db_paths.emplace_back(dbname_ + "_2", 0);
  options.db_paths.emplace_back(dbname_ + "_3", 0);
  options.db_paths.emplace_back(dbname_ + "_4", 0);
  options.env = env_;

  if (getenv("KEEP_DB")) {
    printf("DB is still at %s\n", dbname_.c_str());
  } else {
    EXPECT_OK(DestroyDB(dbname_, options));
  }
  delete env_;
}
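
// Return true if the given option configuration should be skipped for the
// given skip mask (or, in ROCKSDB_LITE builds, because the configuration
// relies on features LITE does not support).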
bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) {
#ifdef ROCKSDB_LITE
  // These options are not supported in ROCKSDB_LITE
  if (option_config == kHashSkipList ||
      option_config == kPlainTableFirstBytePrefix ||
      option_config == kPlainTableCappedPrefix ||
      option_config == kPlainTableCappedPrefixNonMmap ||
      option_config == kPlainTableAllBytesPrefix ||
      option_config == kVectorRep || option_config == kHashLinkList ||
      option_config == kUniversalCompaction ||
      option_config == kUniversalCompactionMultiLevel ||
      option_config == kUniversalSubcompactions ||
      option_config == kFIFOCompaction ||
      option_config == kConcurrentSkipList) {
    return true;
  }
#endif

  if ((skip_mask & kSkipUniversalCompaction) &&
      (option_config == kUniversalCompaction ||
       option_config == kUniversalCompactionMultiLevel ||
       option_config == kUniversalSubcompactions)) {
    return true;
  }
  if ((skip_mask & kSkipMergePut) && option_config == kMergePut) {
    return true;
  }
  if ((skip_mask & kSkipNoSeekToLast) &&
      (option_config == kHashLinkList || option_config == kHashSkipList)) {
    return true;
  }
  if ((skip_mask & kSkipPlainTable) &&
      (option_config == kPlainTableAllBytesPrefix ||
       option_config == kPlainTableFirstBytePrefix ||
       option_config == kPlainTableCappedPrefix ||
       option_config == kPlainTableCappedPrefixNonMmap)) {
    return true;
  }
  if ((skip_mask & kSkipHashIndex) &&
      (option_config == kBlockBasedTableWithPrefixHashIndex ||
       option_config == kBlockBasedTableWithWholeKeyHashIndex)) {
    return true;
  }
  if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) {
    return true;
  }
  if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) {
    return true;
  }
  return false;
}

// Switch to a fresh database with the next option configuration to
// test.  Return false if there are no more configurations to test.
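// Typical usage in a test body (illustrative sketch):
//   do {
//     ... run the test's assertions against the current DB ...
//   } while (ChangeOptions());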
bool DBTestBase::ChangeOptions(int skip_mask) {
  for (option_config_++; option_config_ < kEnd; option_config_++) {
    if (ShouldSkipOptions(option_config_, skip_mask)) {
      continue;
    }
    break;
  }

  if (option_config_ >= kEnd) {
    Destroy(last_options_);
    return false;
  } else {
    auto options = CurrentOptions();
    options.create_if_missing = true;
    DestroyAndReopen(options);
    return true;
  }
}

// Switch between different compaction styles.
bool DBTestBase::ChangeCompactOptions() {
  if (option_config_ == kDefault) {
    option_config_ = kUniversalCompaction;
    Destroy(last_options_);
    auto options = CurrentOptions();
    options.create_if_missing = true;
    Reopen(options);
    return true;
  } else if (option_config_ == kUniversalCompaction) {
    option_config_ = kUniversalCompactionMultiLevel;
    Destroy(last_options_);
    auto options = CurrentOptions();
    options.create_if_missing = true;
    Reopen(options);
    return true;
  } else if (option_config_ == kUniversalCompactionMultiLevel) {
    option_config_ = kLevelSubcompactions;
    Destroy(last_options_);
    auto options = CurrentOptions();
    assert(options.max_subcompactions > 1);
    Reopen(options);
    return true;
  } else if (option_config_ == kLevelSubcompactions) {
    option_config_ = kUniversalSubcompactions;
    Destroy(last_options_);
    auto options = CurrentOptions();
    assert(options.max_subcompactions > 1);
    Reopen(options);
    return true;
  } else {
    return false;
  }
}

// Switch between different WAL settings
bool DBTestBase::ChangeWalOptions() {
  if (option_config_ == kDefault) {
    option_config_ = kDBLogDir;
    Destroy(last_options_);
    auto options = CurrentOptions();
    Destroy(options);
    options.create_if_missing = true;
    Reopen(options);
    return true;
  } else if (option_config_ == kDBLogDir) {
    option_config_ = kWalDirAndMmapReads;
    Destroy(last_options_);
    auto options = CurrentOptions();
    Destroy(options);
    options.create_if_missing = true;
    Reopen(options);
    return true;
  } else if (option_config_ == kWalDirAndMmapReads) {
    option_config_ = kRecycleLogFiles;
    Destroy(last_options_);
    auto options = CurrentOptions();
    Destroy(options);
    Reopen(options);
    return true;
  } else {
    return false;
  }
}

// Switch between different filter policies.
// Jump from kDefault to kFilter to kFullFilterWithNewTableReaderForCompactions
// and then to kPartitionedFilterWithNewTableReaderForCompactions.
bool DBTestBase::ChangeFilterOptions() {
  if (option_config_ == kDefault) {
    option_config_ = kFilter;
  } else if (option_config_ == kFilter) {
    option_config_ = kFullFilterWithNewTableReaderForCompactions;
  } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
    option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
  } else {
    return false;
  }
  Destroy(last_options_);

  auto options = CurrentOptions();
  options.create_if_missing = true;
  TryReopen(options);
  return true;
}

// Switch between different DB options for file ingestion tests.
bool DBTestBase::ChangeOptionsForFileIngestionTest() {
  if (option_config_ == kDefault) {
    option_config_ = kUniversalCompaction;
    Destroy(last_options_);
    auto options = CurrentOptions();
    options.create_if_missing = true;
    TryReopen(options);
    return true;
  } else if (option_config_ == kUniversalCompaction) {
    option_config_ = kUniversalCompactionMultiLevel;
    Destroy(last_options_);
    auto options = CurrentOptions();
    options.create_if_missing = true;
    TryReopen(options);
    return true;
  } else if (option_config_ == kUniversalCompactionMultiLevel) {
    option_config_ = kLevelSubcompactions;
    Destroy(last_options_);
    auto options = CurrentOptions();
    assert(options.max_subcompactions > 1);
    TryReopen(options);
    return true;
  } else if (option_config_ == kLevelSubcompactions) {
    option_config_ = kUniversalSubcompactions;
    Destroy(last_options_);
    auto options = CurrentOptions();
    assert(options.max_subcompactions > 1);
    TryReopen(options);
    return true;
  } else if (option_config_ == kUniversalSubcompactions) {
    option_config_ = kDirectIO;
    Destroy(last_options_);
    auto options = CurrentOptions();
    TryReopen(options);
    return true;
  } else {
    return false;
  }
}

// Return the current option configuration.
Options DBTestBase::CurrentOptions(
    const anon::OptionsOverride& options_override) const {
  return GetOptions(option_config_, GetDefaultOptions(), options_override);
}

Options DBTestBase::CurrentOptions(
    const Options& default_options,
    const anon::OptionsOverride& options_override) const {
  return GetOptions(option_config_, default_options, options_override);
}
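
// Baseline Options shared by every option configuration; GetOptions() starts
// from these and applies per-configuration adjustments.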
Options DBTestBase::GetDefaultOptions() const {
  Options options;
  options.write_buffer_size = 4090 * 4096;
  options.target_file_size_base = 2 * 1024 * 1024;
  options.max_bytes_for_level_base = 10 * 1024 * 1024;
  options.max_open_files = 5000;
  options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
  options.compaction_pri = CompactionPri::kByCompensatedSize;
  options.env = env_;
  if (!env_->skip_fsync_) {
    options.track_and_verify_wals_in_manifest = true;
  }
  return options;
}
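
// Build the Options for option_config: start from default_options, apply the
// per-configuration adjustments in the switch below, and honor fields set in
// options_override (e.g. full_block_cache).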
Options DBTestBase::GetOptions(
    int option_config, const Options& default_options,
    const anon::OptionsOverride& options_override) const {
  // This redundant copy is to minimize code change without causing lint
  // errors.
  Options options = default_options;
  BlockBasedTableOptions table_options;
  bool set_block_based_table_factory = true;
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX)
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "NewRandomAccessFile:O_DIRECT");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "NewWritableFile:O_DIRECT");
#endif

  // kMustFreeHeapAllocations -> indicates ASAN build
  if (kMustFreeHeapAllocations && !options_override.full_block_cache) {
    // Detecting block cache use-after-free is normally difficult in unit
    // tests, because as a cache, it tends to keep unreferenced entries in
    // memory, and we normally want unit tests to take advantage of block
    // cache for speed. However, we also want a strong chance of detecting
    // block cache use-after-free in unit tests in ASAN builds, so for ASAN
    // builds we use a trivially small block cache to which entries can be
    // added but are immediately freed on no more references.
    table_options.block_cache = NewLRUCache(/* too small */ 1);
  }

  bool can_allow_mmap = IsMemoryMappedAccessSupported();
  switch (option_config) {
#ifndef ROCKSDB_LITE
    case kHashSkipList:
      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
      options.memtable_factory.reset(NewHashSkipListRepFactory(16));
      options.allow_concurrent_memtable_write = false;
      options.unordered_write = false;
      break;
    case kPlainTableFirstBytePrefix:
      options.table_factory.reset(NewPlainTableFactory());
      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
      options.allow_mmap_reads = can_allow_mmap;
      options.max_sequential_skip_in_iterations = 999999;
      set_block_based_table_factory = false;
      break;
    case kPlainTableCappedPrefix:
      options.table_factory.reset(NewPlainTableFactory());
      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
      options.allow_mmap_reads = can_allow_mmap;
      options.max_sequential_skip_in_iterations = 999999;
      set_block_based_table_factory = false;
      break;
    case kPlainTableCappedPrefixNonMmap:
      options.table_factory.reset(NewPlainTableFactory());
      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
      options.allow_mmap_reads = false;
      options.max_sequential_skip_in_iterations = 999999;
      set_block_based_table_factory = false;
      break;
    case kPlainTableAllBytesPrefix:
      options.table_factory.reset(NewPlainTableFactory());
      options.prefix_extractor.reset(NewNoopTransform());
      options.allow_mmap_reads = can_allow_mmap;
      options.max_sequential_skip_in_iterations = 999999;
      set_block_based_table_factory = false;
      break;
    case kVectorRep:
      options.memtable_factory.reset(new VectorRepFactory(100));
      options.allow_concurrent_memtable_write = false;
      options.unordered_write = false;
      break;
    case kHashLinkList:
      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
      options.memtable_factory.reset(
          NewHashLinkListRepFactory(4, 0, 3, true, 4));
      options.allow_concurrent_memtable_write = false;
      options.unordered_write = false;
      break;
    case kDirectIO: {
      options.use_direct_reads = true;
      options.use_direct_io_for_flush_and_compaction = true;
      options.compaction_readahead_size = 2 * 1024 * 1024;
      SetupSyncPointsToMockDirectIO();
      break;
    }
#endif  // ROCKSDB_LITE
    case kMergePut:
      options.merge_operator = MergeOperators::CreatePutOperator();
      break;
    case kFilter:
      table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
      break;
    case kFullFilterWithNewTableReaderForCompactions:
      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
      options.compaction_readahead_size = 10 * 1024 * 1024;
      break;
    case kPartitionedFilterWithNewTableReaderForCompactions:
      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
      table_options.partition_filters = true;
      table_options.index_type =
          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
      options.compaction_readahead_size = 10 * 1024 * 1024;
      break;
    case kUncompressed:
      options.compression = kNoCompression;
      break;
    case kNumLevel_3:
      options.num_levels = 3;
      break;
    case kDBLogDir:
      options.db_log_dir = alternative_db_log_dir_;
      break;
    case kWalDirAndMmapReads:
      options.wal_dir = alternative_wal_dir_;
      // mmap reads should be orthogonal to WalDir setting, so we piggyback to
      // this option config to test mmap reads as well
      options.allow_mmap_reads = can_allow_mmap;
      break;
    case kManifestFileSize:
      options.max_manifest_file_size = 50;  // 50 bytes
      break;
    case kPerfOptions:
      options.delayed_write_rate = 8 * 1024 * 1024;
      options.report_bg_io_stats = true;
      // TODO(3.13) -- test more options
      break;
    case kUniversalCompaction:
      options.compaction_style = kCompactionStyleUniversal;
      options.num_levels = 1;
      break;
    case kUniversalCompactionMultiLevel:
      options.compaction_style = kCompactionStyleUniversal;
      options.num_levels = 8;
      break;
    case kCompressedBlockCache:
      options.allow_mmap_writes = can_allow_mmap;
      table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
      break;
    case kInfiniteMaxOpenFiles:
      options.max_open_files = -1;
      break;
    case kXXH3Checksum: {
      table_options.checksum = kXXH3;
      // Thrown in here for basic coverage:
      options.DisableExtraChecks();
      break;
    }
    case kFIFOCompaction: {
      options.compaction_style = kCompactionStyleFIFO;
      options.max_open_files = -1;
      break;
    }
    case kBlockBasedTableWithPrefixHashIndex: {
      table_options.index_type = BlockBasedTableOptions::kHashSearch;
      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
      break;
    }
    case kBlockBasedTableWithWholeKeyHashIndex: {
      table_options.index_type = BlockBasedTableOptions::kHashSearch;
      options.prefix_extractor.reset(NewNoopTransform());
      break;
    }
    case kBlockBasedTableWithPartitionedIndex: {
      table_options.format_version = 3;
      table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
      options.prefix_extractor.reset(NewNoopTransform());
      break;
    }
    case kBlockBasedTableWithPartitionedIndexFormat4: {
      table_options.format_version = 4;
      // Format 4 changes the binary index format. Since partitioned index is a
      // super-set of simple indexes, we are also using kTwoLevelIndexSearch to
      // test this format.
      table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
      // The top-level index in partition filters is also affected by format 4.
      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
      table_options.partition_filters = true;
      table_options.index_block_restart_interval = 8;
      break;
    }
    case kBlockBasedTableWithIndexRestartInterval: {
      table_options.index_block_restart_interval = 8;
      break;
    }
    case kBlockBasedTableWithLatestFormat: {
      // In case different from default
      table_options.format_version = kLatestFormatVersion;
      break;
    }
    case kOptimizeFiltersForHits: {
      options.optimize_filters_for_hits = true;
      set_block_based_table_factory = true;
      break;
    }
    case kRowCache: {
      options.row_cache = NewLRUCache(1024 * 1024);
      break;
    }
    case kRecycleLogFiles: {
      options.recycle_log_file_num = 2;
      break;
    }
Parallelize L0-L1 Compaction: Restructure Compaction Job
Summary:
As of now compactions involving files from Level 0 and Level 1 are single
threaded because the files in L0, although sorted, are not range partitioned like
the other levels. This means that during L0-L1 compaction each file from L1
needs to be merged with potentially all the files from L0.
This attempt to parallelize the L0-L1 compaction assigns a thread and a
corresponding iterator to each L1 file that then considers only the key range
found in that L1 file and only the L0 files that have those keys (and only the
specific portion of those L0 files in which those keys are found). In this way
the overlap is minimized and potentially eliminated between different iterators
focusing on the same files.
The first step is to restructure the compaction logic to break L0-L1 compactions
into multiple, smaller, sequential compactions. Eventually each of these smaller
jobs will be run simultaneously. Areas to pay extra attention to are:
1. Correct aggregation of compaction job statistics across multiple threads
2. Proper opening/closing of output files (make sure each thread's is unique)
3. Keys that span multiple L1 files
4. Skewed distributions of keys within L0 files
Test Plan: Make and run db_test (newer version has separate compaction tests) and compaction_job_stats_test
Reviewers: igor, noetzli, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D42699
9 years ago
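A minimal sketch of the user-facing knob the test configs below exercise (the helper name SubcompactionOptions is hypothetical; Options::max_subcompactions is the real option set by kLevelSubcompactions and kUniversalSubcompactions):

// Illustrative sketch, not part of db_test_util.cc: allow compaction jobs
// (including L0->L1) to be split into parallel subcompactions.
#include "rocksdb/options.h"

rocksdb::Options SubcompactionOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Up to four subcompactions may run in parallel for one compaction job
  // when the key range can be partitioned.
  options.max_subcompactions = 4;
  return options;
}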
    case kLevelSubcompactions: {
      options.max_subcompactions = 4;
      break;
    }
    case kUniversalSubcompactions: {
      options.compaction_style = kCompactionStyleUniversal;
      options.num_levels = 8;
      options.max_subcompactions = 4;
      break;
    }
    case kConcurrentSkipList: {
      options.allow_concurrent_memtable_write = true;
      options.enable_write_thread_adaptive_yield = true;
      break;
    }
    case kPipelinedWrite: {
      options.enable_pipelined_write = true;
      break;
    }
    case kConcurrentWALWrites: {
      // These options optimize the 2PC commit path.
      options.two_write_queues = true;
      options.manual_wal_flush = true;
      break;
    }
    case kUnorderedWrite: {
      options.allow_concurrent_memtable_write = false;
      options.unordered_write = false;
      break;
    }

    default:
      break;
  }

  if (options_override.filter_policy) {
    table_options.filter_policy = options_override.filter_policy;
    table_options.partition_filters = options_override.partition_filters;
    table_options.metadata_block_size = options_override.metadata_block_size;
  }
  if (set_block_based_table_factory) {
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  }
  options.env = env_;
  options.create_if_missing = true;
  options.fail_if_options_file_error = true;
  return options;
}

void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
                                      const Options& options) {
  ColumnFamilyOptions cf_opts(options);
  size_t cfi = handles_.size();
  handles_.resize(cfi + cfs.size());
  for (auto cf : cfs) {
    Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
    ASSERT_OK(s);
  }
}

void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
                                       const Options& options) {
  CreateColumnFamilies(cfs, options);
  std::vector<std::string> cfs_plus_default = cfs;
  cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
  ReopenWithColumnFamilies(cfs_plus_default, options);
}

void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
                                          const std::vector<Options>& options) {
  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
}

void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
                                          const Options& options) {
  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
}
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and Windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also remove accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
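A minimal sketch of how a test built on DBTestBase might use these helpers (the key, value, and sleep duration are illustrative, and the SpecialEnv mock-sleep call is assumed to be MockSleepForSeconds per the description above):

// Illustrative test sketch, assuming a DBTestBase-derived fixture.
Options options = CurrentOptions();
SetTimeElapseOnlySleepOnReopen(&options);  // implies mock sleep on re-open
Reopen(options);
ASSERT_OK(Put("key", "value"));
// Advance the mocked clock instead of really sleeping.
env_->MockSleepForSeconds(60);
ASSERT_OK(Flush());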
void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) {
  time_elapse_only_sleep_on_reopen_ = true;

  // Need to disable stats dumping and persisting which also use
  // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal.
  // With time_elapse_only_sleep_, this can hang on some platforms (MacOS)
  // because (a) on some platforms, pthread_cond_timedwait does not appear
  // to release the lock for other threads to operate if the deadline time
  // is already passed, and (b) TimedWait calls are currently a bad abstraction
  // because the deadline parameter is usually computed from Env time,
  // but is interpreted in real clock time.
  options->stats_dump_period_sec = 0;
  options->stats_persist_period_sec = 0;
}

void DBTestBase::MaybeInstallTimeElapseOnlySleep(const DBOptions& options) {
  if (time_elapse_only_sleep_on_reopen_) {
    assert(options.env == env_ ||
           static_cast_with_check<CompositeEnvWrapper>(options.env)
                   ->env_target() == env_);
    assert(options.stats_dump_period_sec == 0);
    assert(options.stats_persist_period_sec == 0);
    // We cannot set these before destroying the last DB because they might
    // cause a deadlock or similar without the appropriate options set in
    // the DB.
    env_->time_elapse_only_sleep_ = true;
    env_->no_slowdown_ = true;
  } else {
    // Going back in same test run is not yet supported, so no
    // reset in this case.
  }
}

Status DBTestBase::TryReopenWithColumnFamilies(
    const std::vector<std::string>& cfs, const std::vector<Options>& options) {
  Close();
  EXPECT_EQ(cfs.size(), options.size());
  std::vector<ColumnFamilyDescriptor> column_families;
  for (size_t i = 0; i < cfs.size(); ++i) {
    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
  }
  DBOptions db_opts = DBOptions(options[0]);
  last_options_ = options[0];
  MaybeInstallTimeElapseOnlySleep(db_opts);
  return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
}

Status DBTestBase::TryReopenWithColumnFamilies(
    const std::vector<std::string>& cfs, const Options& options) {
  Close();
  std::vector<Options> v_opts(cfs.size(), options);
  return TryReopenWithColumnFamilies(cfs, v_opts);
}

void DBTestBase::Reopen(const Options& options) {
  ASSERT_OK(TryReopen(options));
}

void DBTestBase::Close() {
  for (auto h : handles_) {
    EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
  }
  handles_.clear();
  delete db_;
  db_ = nullptr;
}

void DBTestBase::DestroyAndReopen(const Options& options) {
  // Destroy using last options
  Destroy(last_options_);
  Reopen(options);
}

void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
  std::vector<ColumnFamilyDescriptor> column_families;
  if (delete_cf_paths) {
    for (size_t i = 0; i < handles_.size(); ++i) {
      ColumnFamilyDescriptor cfdescriptor;
      // GetDescriptor is not implemented for ROCKSDB_LITE
      handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
      column_families.push_back(cfdescriptor);
    }
  }
  Close();
  ASSERT_OK(DestroyDB(dbname_, options, column_families));
}

Status DBTestBase::ReadOnlyReopen(const Options& options) {
  MaybeInstallTimeElapseOnlySleep(options);
  return DB::OpenForReadOnly(options, dbname_, &db_);
}

Status DBTestBase::TryReopen(const Options& options) {
  Close();
  last_options_.table_factory.reset();
  // Note: operator= is an unsafe approach here since it destructs
  // std::shared_ptr in the same order of their creation, in contrast to
  // destructors, which destruct them in the opposite order of creation. One
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
4 years ago
  // particular problem is that the cache destructor might invoke callback
  // functions that use Option members such as statistics. To work around this
  // problem, we manually call the destructor of table_factory, which
  // eventually clears the block cache.
  last_options_ = options;
  MaybeInstallTimeElapseOnlySleep(options);
  return DB::Open(options, dbname_, &db_);
}

bool DBTestBase::IsDirectIOSupported() {
  return test::IsDirectIOSupported(env_, dbname_);
}

bool DBTestBase::IsMemoryMappedAccessSupported() const {
  return (!encrypted_env_);
}

Status DBTestBase::Flush(int cf) {
  if (cf == 0) {
    return db_->Flush(FlushOptions());
  } else {
    return db_->Flush(FlushOptions(), handles_[cf]);
  }
}

Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
  std::vector<ColumnFamilyHandle*> cfhs;
  std::for_each(cf_ids.begin(), cf_ids.end(),
                [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
  return db_->Flush(FlushOptions(), cfhs);
}

Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
  if (kMergePut == option_config_) {
    return db_->Merge(wo, k, v);
  } else {
    return db_->Put(wo, k, v);
  }
}

Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
                       WriteOptions wo) {
  if (kMergePut == option_config_) {
    return db_->Merge(wo, handles_[cf], k, v);
  } else {
    return db_->Put(wo, handles_[cf], k, v);
  }
}

Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
  return db_->Merge(wo, k, v);
}

Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
                         WriteOptions wo) {
  return db_->Merge(wo, handles_[cf], k, v);
}

Status DBTestBase::Delete(const std::string& k) {
  return db_->Delete(WriteOptions(), k);
}

Status DBTestBase::Delete(int cf, const std::string& k) {
  return db_->Delete(WriteOptions(), handles_[cf], k);
}
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (an older version of
this patch supported this, so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
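A minimal sketch of the intended usage pattern described above (the key, value, and helper name WriteThenSingleDelete are illustrative; Put and SingleDelete are the public DB API):

// Illustrative sketch, not part of db_test_util.cc: SingleDelete removes a
// key that was written exactly once and never overwritten, so the tombstone
// and the value can be dropped together during compaction.
#include "rocksdb/db.h"

rocksdb::Status WriteThenSingleDelete(rocksdb::DB* db) {
  rocksdb::WriteOptions wo;
  rocksdb::Status s = db->Put(wo, "ephemeral_key", "value");
  if (!s.ok()) return s;
  // Safe only because "ephemeral_key" was Put() exactly once.
  return db->SingleDelete(wo, "ephemeral_key");
}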
Status DBTestBase::SingleDelete(const std::string& k) {
  return db_->SingleDelete(WriteOptions(), k);
}

Status DBTestBase::SingleDelete(int cf, const std::string& k) {
  return db_->SingleDelete(WriteOptions(), handles_[cf], k);
}

std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
  ReadOptions options;
  options.verify_checksums = true;
  options.snapshot = snapshot;
  std::string result;
  Status s = db_->Get(options, k, &result);
  if (s.IsNotFound()) {
    result = "NOT_FOUND";
  } else if (!s.ok()) {
    result = s.ToString();
  }
  return result;
}

std::string DBTestBase::Get(int cf, const std::string& k,
                            const Snapshot* snapshot) {
  ReadOptions options;
  options.verify_checksums = true;
  options.snapshot = snapshot;
  std::string result;
  Status s = db_->Get(options, handles_[cf], k, &result);
  if (s.IsNotFound()) {
    result = "NOT_FOUND";
  } else if (!s.ok()) {
    result = s.ToString();
  }
  return result;
}

std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
                                              const std::vector<std::string>& k,
                                              const Snapshot* snapshot,
                                              const bool batched) {
  ReadOptions options;
  options.verify_checksums = true;
  options.snapshot = snapshot;
  std::vector<ColumnFamilyHandle*> handles;
  std::vector<Slice> keys;
  std::vector<std::string> result;

  for (unsigned int i = 0; i < cfs.size(); ++i) {
    handles.push_back(handles_[cfs[i]]);
    keys.push_back(k[i]);
  }
  std::vector<Status> s;
  if (!batched) {
    s = db_->MultiGet(options, handles, keys, &result);
    for (size_t i = 0; i < s.size(); ++i) {
      if (s[i].IsNotFound()) {
        result[i] = "NOT_FOUND";
      } else if (!s[i].ok()) {
        result[i] = s[i].ToString();
      }
    }
  } else {
    std::vector<PinnableSlice> pin_values(cfs.size());
    result.resize(cfs.size());
    s.resize(cfs.size());
    db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
                  pin_values.data(), s.data());
    for (size_t i = 0; i < s.size(); ++i) {
      if (s[i].IsNotFound()) {
        result[i] = "NOT_FOUND";
      } else if (!s[i].ok()) {
        result[i] = s[i].ToString();
      } else {
        result[i].assign(pin_values[i].data(), pin_values[i].size());
        // Increase likelihood of detecting potential use-after-free bugs with
        // PinnableSlices tracking the same resource
        pin_values[i].Reset();
      }
    }
  }
  return result;
}
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queried, as well as larger batch sizes. The main benefits are due to:
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
db_bench command used (on a DB with 4 levels, 12 million keys):
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
6 years ago
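A minimal sketch of the batched overload from the caller's side (the keys and the helper name BatchedLookup are illustrative; the MultiGet overload taking raw key, PinnableSlice, and Status arrays is the API the wrapper below exercises):

// Illustrative sketch, not part of db_test_util.cc: batched MultiGet with
// PinnableSlice results and per-key statuses.
#include <array>
#include "rocksdb/db.h"

void BatchedLookup(rocksdb::DB* db) {
  constexpr size_t kNumKeys = 3;
  std::array<rocksdb::Slice, kNumKeys> keys = {"a", "b", "c"};
  std::array<rocksdb::PinnableSlice, kNumKeys> values;
  std::array<rocksdb::Status, kNumKeys> statuses;
  db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(), kNumKeys,
               keys.data(), values.data(), statuses.data());
  for (size_t i = 0; i < kNumKeys; ++i) {
    if (statuses[i].ok()) {
      // Each value may pin a (possibly shared) block cache entry; Reset()
      // or destruction releases that pin.
      values[i].Reset();
    }
  }
}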
std::vector<std::string> DBTestBase::MultiGet(const std::vector<std::string>& k,
                                              const Snapshot* snapshot) {
  ReadOptions options;
  options.verify_checksums = true;
  options.snapshot = snapshot;
  std::vector<Slice> keys;
  std::vector<std::string> result(k.size());
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups within a file. The reason for the new API is twofold - the definition allows callers to allocate storage for statuses and values on the stack instead of in std::vector, and to receive values as PinnableSlices in order to avoid copying; and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queried, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality-of-reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the CPU benefit of batching, the runs were single threaded and bound to the same CPU to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Latency in micros/op by batch size (1 | 2 | 4 | 8 | 16 | 32):
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05  | 4.1   | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11  | 4.182 | 4.14  - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99  | 2.84  | 2.753 - Get
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 - MultiGet (no batching)
4.452 | 3.45  | 2.833 | 2.451 | 2.233 | 2.135 - MultiGet (w/ batching)
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 - Get
4.406 | 4.005 | 3.644 | 3.49  | 3.381 | 3.268 - MultiGet (no batching)
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62  - MultiGet (w/ batching)
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61  | 3.582 | 3.555 - Get
4.364 | 4.057 | 3.791 | 3.65  | 3.57  | 3.465 - MultiGet (no batching)
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 - MultiGet (w/ batching)
db_bench command used (on a DB with 4 levels, 12 million keys):
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
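For orientation, a minimal sketch of how a caller can use the batched overload with stack-allocated result storage; the keys and the fixed array size are purely illustrative (they are not taken from this PR), and the MultiGet wrapper below in this file exercises the same call with std::vector storage.
```
// Batched MultiGet(): statuses and values live on the stack, and values
// come back as PinnableSlices so no extra copy is made.
std::array<Slice, 3> keys = {Slice("k1"), Slice("k2"), Slice("k3")};
std::array<PinnableSlice, 3> values;
std::array<Status, 3> statuses;
db->MultiGet(ReadOptions(), db->DefaultColumnFamily(), keys.size(),
             keys.data(), values.data(), statuses.data());
for (size_t i = 0; i < keys.size(); ++i) {
  if (statuses[i].ok()) {
    // values[i] pins the underlying block; Reset() (or destruction)
    // releases it.
    values[i].Reset();
  }
}
```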
|
|
|
std::vector<Status> statuses(k.size());
|
|
|
|
std::vector<PinnableSlice> pin_values(k.size());
for (size_t i = 0; i < k.size(); ++i) {
keys.push_back(k[i]);
|
|
|
|
}
|
|
|
|
db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), pin_values.data(), statuses.data());
for (size_t i = 0; i < statuses.size(); ++i) {
if (statuses[i].IsNotFound()) {
|
|
|
|
result[i] = "NOT_FOUND";
} else if (!statuses[i].ok()) {
|
|
|
|
result[i] = statuses[i].ToString();
|
|
|
|
} else {
|
|
|
|
result[i].assign(pin_values[i].data(), pin_values[i].size());
|
|
|
|
// Increase likelihood of detecting potential use-after-free bugs with
|
|
|
|
// PinnableSlices tracking the same resource
|
|
|
|
pin_values[i].Reset();
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBTestBase::Get(const std::string& k, PinnableSlice* v) {
|
|
|
|
ReadOptions options;
|
|
|
|
options.verify_checksums = true;
|
|
|
|
Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t DBTestBase::GetNumSnapshots() {
|
|
|
|
uint64_t int_num;
|
|
|
|
EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
|
|
|
|
return int_num;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t DBTestBase::GetTimeOldestSnapshots() {
|
|
|
|
uint64_t int_num;
|
|
|
|
EXPECT_TRUE(
|
|
|
|
dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
|
|
|
|
return int_num;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t DBTestBase::GetSequenceOldestSnapshots() {
|
|
|
|
uint64_t int_num;
|
|
|
|
EXPECT_TRUE(
|
|
|
|
dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num));
|
|
|
|
return int_num;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return a string that contains all key,value pairs in order,
|
|
|
|
// formatted like "(k1->v1)(k2->v2)".
|
|
|
|
std::string DBTestBase::Contents(int cf) {
|
|
|
|
std::vector<std::string> forward;
|
|
|
|
std::string result;
|
|
|
|
Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
|
|
|
|
: db_->NewIterator(ReadOptions(), handles_[cf]);
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
std::string s = IterStatus(iter);
|
|
|
|
result.push_back('(');
|
|
|
|
result.append(s);
|
|
|
|
result.push_back(')');
|
|
|
|
forward.push_back(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check reverse iteration results are the reverse of forward results
|
|
|
|
unsigned int matched = 0;
|
|
|
|
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
|
|
|
EXPECT_LT(matched, forward.size());
|
|
|
|
EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
|
|
|
|
matched++;
|
|
|
|
}
|
|
|
|
EXPECT_EQ(matched, forward.size());
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
|
|
|
|
Arena arena;
|
|
|
|
auto options = CurrentOptions();
|
|
|
|
InternalKeyComparator icmp(options.comparator);
|
|
|
|
ReadRangeDelAggregator range_del_agg(&icmp,
|
|
|
|
kMaxSequenceNumber /* upper_bound */);
|
|
|
|
ReadOptions read_options;
|
|
|
|
ScopedArenaIterator iter;
|
|
|
|
if (cf == 0) {
|
|
|
|
iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg,
|
|
|
|
kMaxSequenceNumber));
|
|
|
|
} else {
|
|
|
|
iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg,
|
|
|
|
kMaxSequenceNumber, handles_[cf]));
|
|
|
|
}
|
|
|
|
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
iter->Seek(target.Encode());
|
|
|
|
std::string result;
|
|
|
|
if (!iter->status().ok()) {
|
|
|
|
result = iter->status().ToString();
|
|
|
|
} else {
|
|
|
|
result = "[ ";
|
|
|
|
bool first = true;
|
|
|
|
while (iter->Valid()) {
|
|
|
|
ParsedInternalKey ikey(Slice(), 0, kTypeValue);
|
|
|
|
if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) !=
|
|
|
|
Status::OK()) {
|
|
|
|
result += "CORRUPTED";
|
|
|
|
} else {
|
|
|
|
if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!first) {
|
|
|
|
result += ", ";
|
|
|
|
}
|
|
|
|
first = false;
|
|
|
|
switch (ikey.type) {
|
|
|
|
case kTypeValue:
|
|
|
|
result += iter->value().ToString();
|
|
|
|
break;
|
|
|
|
case kTypeMerge:
|
|
|
|
// keep it the same as kTypeValue for testing kMergePut
|
|
|
|
result += iter->value().ToString();
|
|
|
|
break;
|
|
|
|
case kTypeDeletion:
|
|
|
|
result += "DEL";
|
|
|
|
break;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted, the behavior is undefined. Single deletion of a
non-existent key has no effect, but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
the semantics are similar to @igor's prototype that allowed this behavior on
the granularity of a column family ( https://reviews.facebook.net/D42093 ).
This new patch, however, is more aggressive when it comes to removing
tombstones: it removes the SingleDelete together with the value whenever there
is no snapshot between them, while the older patch only did this when the
sequence number of the deletion was older than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator; all other
changes should be relatively straightforward. The patch also includes basic
support for single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (an older version of
this patch supported them, so this could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
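As a quick illustration of the contract spelled out above, a minimal usage sketch (the key and value below are purely illustrative):
```
// Sketch of the intended SingleDelete() pattern: a key is written at most
// once and then removed with exactly one SingleDelete(). Overwriting the
// key before the SingleDelete(), or issuing two consecutive SingleDelete()s
// for it, is undefined behavior.
Status s = db->Put(WriteOptions(), "session:42", "payload");
if (s.ok()) {
  s = db->SingleDelete(WriteOptions(), "session:42");
}
```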
|
|
|
case kTypeSingleDeletion:
|
|
|
|
result += "SDEL";
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
if (!first) {
|
|
|
|
result += " ";
|
|
|
|
}
|
|
|
|
result += "]";
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
int DBTestBase::NumSortedRuns(int cf) {
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
if (cf == 0) {
|
|
|
|
db_->GetColumnFamilyMetaData(&cf_meta);
|
|
|
|
} else {
|
|
|
|
db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
|
|
|
|
}
|
|
|
|
int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
|
|
|
|
for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
|
|
|
|
if (cf_meta.levels[i].files.size() > 0) {
|
|
|
|
num_sr++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return num_sr;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t DBTestBase::TotalSize(int cf) {
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
if (cf == 0) {
|
|
|
|
db_->GetColumnFamilyMetaData(&cf_meta);
|
|
|
|
} else {
|
|
|
|
db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
|
|
|
|
}
|
|
|
|
return cf_meta.size;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t DBTestBase::SizeAtLevel(int level) {
|
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
db_->GetLiveFilesMetaData(&metadata);
|
|
|
|
uint64_t sum = 0;
|
|
|
|
for (const auto& m : metadata) {
|
|
|
|
if (m.level == level) {
|
|
|
|
sum += m.size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t DBTestBase::TotalLiveFiles(int cf) {
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
if (cf == 0) {
|
|
|
|
db_->GetColumnFamilyMetaData(&cf_meta);
|
|
|
|
} else {
|
|
|
|
db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
|
|
|
|
}
|
|
|
|
size_t num_files = 0;
|
|
|
|
for (auto& level : cf_meta.levels) {
|
|
|
|
num_files += level.files.size();
|
|
|
|
}
|
|
|
|
return num_files;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t DBTestBase::CountLiveFiles() {
|
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
db_->GetLiveFilesMetaData(&metadata);
|
|
|
|
return metadata.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
|
|
|
|
std::string property;
|
|
|
|
if (cf == 0) {
|
|
|
|
// default cfd
|
|
|
|
EXPECT_TRUE(db_->GetProperty(
|
|
|
|
"rocksdb.num-files-at-level" + std::to_string(level), &property));
|
|
|
|
} else {
|
|
|
|
EXPECT_TRUE(db_->GetProperty(
|
|
|
|
handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
|
|
|
|
&property));
|
|
|
|
}
|
|
|
|
return atoi(property.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
double DBTestBase::CompressionRatioAtLevel(int level, int cf) {
|
|
|
|
std::string property;
|
|
|
|
if (cf == 0) {
|
|
|
|
// default cfd
|
|
|
|
EXPECT_TRUE(db_->GetProperty(
|
|
|
|
"rocksdb.compression-ratio-at-level" + std::to_string(level),
|
|
|
|
&property));
|
|
|
|
} else {
|
|
|
|
EXPECT_TRUE(db_->GetProperty(
|
|
|
|
handles_[cf],
|
|
|
|
"rocksdb.compression-ratio-at-level" + std::to_string(level),
|
|
|
|
&property));
|
|
|
|
}
|
|
|
|
return std::stod(property);
|
|
|
|
}
|
|
|
|
|
|
|
|
int DBTestBase::TotalTableFiles(int cf, int levels) {
|
|
|
|
if (levels == -1) {
|
|
|
|
levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
|
|
|
|
}
|
|
|
|
int result = 0;
|
|
|
|
for (int level = 0; level < levels; level++) {
|
|
|
|
result += NumTableFilesAtLevel(level, cf);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return spread of files per level
|
|
|
|
std::string DBTestBase::FilesPerLevel(int cf) {
|
|
|
|
int num_levels =
|
|
|
|
(cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
|
|
|
|
std::string result;
|
|
|
|
size_t last_non_zero_offset = 0;
|
|
|
|
for (int level = 0; level < num_levels; level++) {
|
|
|
|
int f = NumTableFilesAtLevel(level, cf);
|
|
|
|
char buf[100];
|
|
|
|
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
|
|
|
|
result += buf;
|
|
|
|
if (f > 0) {
|
|
|
|
last_non_zero_offset = result.size();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
result.resize(last_non_zero_offset);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
std::vector<uint64_t> DBTestBase::GetBlobFileNumbers() {
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
assert(current);
|
|
|
|
|
|
|
|
const VersionStorageInfo* const storage_info = current->storage_info();
|
|
|
|
assert(storage_info);
|
|
|
|
|
|
|
|
const auto& blob_files = storage_info->GetBlobFiles();
|
|
|
|
|
|
|
|
std::vector<uint64_t> result;
|
|
|
|
result.reserve(blob_files.size());
|
|
|
|
|
|
|
|
for (const auto& blob_file : blob_files) {
|
Use a sorted vector instead of a map to store blob file metadata (#9526)
Summary:
The patch replaces `std::map` with a sorted `std::vector` for
`VersionStorageInfo::blob_files_` and preallocates the space
for the `vector` before saving the `BlobFileMetaData` into the
new `VersionStorageInfo` in `VersionBuilder::Rep::SaveBlobFilesTo`.
These changes reduce the time the DB mutex is held while
saving new `Version`s, and using a sorted `vector` also makes
lookups faster thanks to better memory locality.
In addition, the patch introduces helper methods
`VersionStorageInfo::GetBlobFileMetaData` and
`VersionStorageInfo::GetBlobFileMetaDataLB` that can be used by
clients to perform lookups in the `vector`, and does some general
cleanup in the parts of code where blob file metadata are used.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9526
Test Plan:
Ran `make check` and the crash test script for a while.
Performance was tested using a load-optimized benchmark (`fillseq` with vector memtable, no WAL) and small file sizes so that a significant number of files are produced:
```
numactl --interleave=all ./db_bench --benchmarks=fillseq --allow_concurrent_memtable_write=false --level0_file_num_compaction_trigger=4 --level0_slowdown_writes_trigger=20 --level0_stop_writes_trigger=30 --max_background_jobs=8 --max_write_buffer_number=8 --db=/data/ltamasi-dbbench --wal_dir=/data/ltamasi-dbbench --num=800000000 --num_levels=8 --key_size=20 --value_size=400 --block_size=8192 --cache_size=51539607552 --cache_numshardbits=6 --compression_max_dict_bytes=0 --compression_ratio=0.5 --compression_type=lz4 --bytes_per_sync=8388608 --cache_index_and_filter_blocks=1 --cache_high_pri_pool_ratio=0.5 --benchmark_write_rate_limit=0 --write_buffer_size=16777216 --target_file_size_base=16777216 --max_bytes_for_level_base=67108864 --verify_checksum=1 --delete_obsolete_files_period_micros=62914560 --max_bytes_for_level_multiplier=8 --statistics=0 --stats_per_interval=1 --stats_interval_seconds=20 --histogram=1 --memtablerep=skip_list --bloom_bits=10 --open_files=-1 --subcompactions=1 --compaction_style=0 --min_level_to_compress=3 --level_compaction_dynamic_level_bytes=true --pin_l0_filter_and_index_blocks_in_cache=1 --soft_pending_compaction_bytes_limit=167503724544 --hard_pending_compaction_bytes_limit=335007449088 --min_level_to_compress=0 --use_existing_db=0 --sync=0 --threads=1 --memtablerep=vector --allow_concurrent_memtable_write=false --disable_wal=1 --enable_blob_files=1 --blob_file_size=16777216 --min_blob_size=0 --blob_compression_type=lz4 --enable_blob_garbage_collection=1 --seed=<some value>
```
Final statistics before the patch:
```
Cumulative writes: 0 writes, 700M keys, 0 commit groups, 0.0 writes per commit group, ingest: 284.62 GB, 121.27 MB/s
Interval writes: 0 writes, 334K keys, 0 commit groups, 0.0 writes per commit group, ingest: 139.28 MB, 72.46 MB/s
```
With the patch:
```
Cumulative writes: 0 writes, 760M keys, 0 commit groups, 0.0 writes per commit group, ingest: 308.66 GB, 131.52 MB/s
Interval writes: 0 writes, 445K keys, 0 commit groups, 0.0 writes per commit group, ingest: 185.35 MB, 93.15 MB/s
```
Total time to complete the benchmark is 2611 seconds with the patch, down from 2986 secs.
Reviewed By: riversand963
Differential Revision: D34082728
Pulled By: ltamasi
fbshipit-source-id: fc598abf676dce436734d06bb9d2d99a26a004fc
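For context, the lookup this enables is just a binary search over the sorted vector. The sketch below is a generic illustration of that pattern, not the actual RocksDB helper; BlobFileMetaSketch and FindBlobFileSketch are made-up names for the example.
```
#include <algorithm>
#include <cstdint>
#include <memory>
#include <vector>

// Minimal stand-in for blob file metadata, kept sorted by file number.
struct BlobFileMetaSketch {
  uint64_t blob_file_number;
};
using BlobFilesSketch = std::vector<std::shared_ptr<BlobFileMetaSketch>>;

// std::lower_bound on the sorted vector gives the same O(log n) lookup as
// std::map::find, but with contiguous storage and better memory locality.
std::shared_ptr<BlobFileMetaSketch> FindBlobFileSketch(
    const BlobFilesSketch& files, uint64_t blob_file_number) {
  auto it = std::lower_bound(
      files.begin(), files.end(), blob_file_number,
      [](const std::shared_ptr<BlobFileMetaSketch>& lhs, uint64_t rhs) {
        return lhs->blob_file_number < rhs;
      });
  if (it != files.end() && (*it)->blob_file_number == blob_file_number) {
    return *it;
  }
  return nullptr;
}
```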
|
|
|
assert(blob_file);
|
|
|
|
result.emplace_back(blob_file->GetBlobFileNumber());
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t DBTestBase::CountFiles() {
|
|
|
|
size_t count = 0;
|
|
|
|
std::vector<std::string> files;
|
|
|
|
if (env_->GetChildren(dbname_, &files).ok()) {
|
|
|
|
count += files.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dbname_ != last_options_.wal_dir) {
|
|
|
|
if (env_->GetChildren(last_options_.wal_dir, &files).ok()) {
|
|
|
|
count += files.size();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBTestBase::CountFiles(size_t* count) {
|
|
|
|
std::vector<std::string> files;
|
|
|
|
Status s = env_->GetChildren(dbname_, &files);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
size_t files_count = files.size();
|
|
|
|
|
|
|
|
if (dbname_ != last_options_.wal_dir) {
|
|
|
|
s = env_->GetChildren(last_options_.wal_dir, &files);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
*count = files_count + files.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf,
|
|
|
|
uint64_t* size) {
|
|
|
|
Range r(start, limit);
|
|
|
|
if (cf == 0) {
|
|
|
|
return db_->GetApproximateSizes(&r, 1, size);
|
|
|
|
} else {
|
|
|
|
return db_->GetApproximateSizes(handles_[1], &r, 1, size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit,
|
|
|
|
uint32_t target_path_id) {
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.target_path_id = target_path_id;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) {
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::Compact(const Slice& start, const Slice& limit) {
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Do n memtable compactions, each of which produces an sstable
|
|
|
|
// covering the range [small,large].
|
|
|
|
void DBTestBase::MakeTables(int n, const std::string& small,
|
|
|
|
const std::string& large, int cf) {
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
ASSERT_OK(Put(cf, small, "begin"));
|
|
|
|
ASSERT_OK(Put(cf, large, "end"));
|
|
|
|
ASSERT_OK(Flush(cf));
|
|
|
|
MoveFilesToLevel(n - i - 1, cf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Prevent pushing of new sstables into deeper levels by adding
|
|
|
|
// tables that cover a specified range to all levels.
|
|
|
|
void DBTestBase::FillLevels(const std::string& smallest,
|
|
|
|
const std::string& largest, int cf) {
|
|
|
|
MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::MoveFilesToLevel(int level, int cf) {
|
|
|
|
for (int l = 0; l < level; ++l) {
|
|
|
|
if (cf > 0) {
|
|
|
|
EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]));
|
|
|
|
} else {
|
|
|
|
EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
void DBTestBase::DumpFileCounts(const char* label) {
|
|
|
|
fprintf(stderr, "---\n%s:\n", label);
|
|
|
|
fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
|
|
|
|
dbfull()->TEST_MaxNextLevelOverlappingBytes());
|
|
|
|
for (int level = 0; level < db_->NumberLevels(); level++) {
|
|
|
|
int num = NumTableFilesAtLevel(level);
|
|
|
|
if (num > 0) {
|
|
|
|
fprintf(stderr, " level %3d : %d files\n", level, num);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
std::string DBTestBase::DumpSSTableList() {
|
|
|
|
std::string property;
|
|
|
|
db_->GetProperty("rocksdb.sstables", &property);
|
|
|
|
return property;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::GetSstFiles(Env* env, std::string path,
|
|
|
|
std::vector<std::string>* files) {
|
|
|
|
EXPECT_OK(env->GetChildren(path, files));
|
|
|
|
|
|
|
|
files->erase(
|
|
|
|
std::remove_if(files->begin(), files->end(), [](std::string name) {
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
return !(ParseFileName(name, &number, &type) && type == kTableFile);
|
|
|
|
}), files->end());
|
|
|
|
}
|
|
|
|
|
|
|
|
int DBTestBase::GetSstFileCount(std::string path) {
|
|
|
|
std::vector<std::string> files;
|
|
|
|
DBTestBase::GetSstFiles(env_, path, &files);
|
|
|
|
return static_cast<int>(files.size());
|
|
|
|
}
|
|
|
|
|
|
|
|
// this will generate non-overlapping files since it keeps increasing key_idx
|
|
|
|
void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
|
|
|
|
bool nowait) {
|
|
|
|
for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
|
|
|
|
ASSERT_OK(Put(cf, Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
|
|
|
|
(*key_idx)++;
|
|
|
|
}
|
|
|
|
if (!nowait) {
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// this will generate non-overlapping files since it keeps increasing key_idx
|
|
|
|
void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
|
|
|
|
for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
|
|
|
|
ASSERT_OK(Put(Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
|
|
|
|
(*key_idx)++;
|
|
|
|
}
|
|
|
|
if (!nowait) {
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51;
|
|
|
|
|
|
|
|
void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
|
|
|
|
for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) {
|
|
|
|
ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(2000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200)));
|
|
|
|
if (!nowait) {
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string DBTestBase::IterStatus(Iterator* iter) {
|
|
|
|
std::string result;
|
|
|
|
if (iter->Valid()) {
|
|
|
|
result = iter->key().ToString() + "->" + iter->value().ToString();
|
|
|
|
} else {
|
|
|
|
result = "(invalid)";
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
Options DBTestBase::OptionsForLogIterTest() {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.WAL_ttl_seconds = 1000;
|
|
|
|
return options;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string DBTestBase::DummyString(size_t len, char c) {
|
|
|
|
return std::string(len, c);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
|
|
|
|
Iterator* iter;
|
|
|
|
ReadOptions ro;
|
|
|
|
if (cf == 0) {
|
|
|
|
iter = db_->NewIterator(ro);
|
|
|
|
} else {
|
|
|
|
iter = db_->NewIterator(ro, handles_[cf]);
|
|
|
|
}
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_EQ(IterStatus(iter), expected_key);
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Used to test InplaceUpdate
|
|
|
|
|
|
|
|
// If the previous value is nullptr or the delta is greater than the previous
// value, sets newValue based on the delta.
// If the previous value is not empty, updates the previous value in place
// with a 'b' string of (previous value size - 1).
|
|
|
|
UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue,
|
|
|
|
uint32_t* prevSize,
|
|
|
|
Slice delta,
|
|
|
|
std::string* newValue) {
|
|
|
|
if (prevValue == nullptr) {
|
|
|
|
*newValue = std::string(delta.size(), 'c');
|
|
|
|
return UpdateStatus::UPDATED;
|
|
|
|
} else {
|
|
|
|
*prevSize = *prevSize - 1;
|
|
|
|
std::string str_b = std::string(*prevSize, 'b');
|
|
|
|
memcpy(prevValue, str_b.c_str(), str_b.size());
|
|
|
|
return UpdateStatus::UPDATED_INPLACE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
|
|
|
|
uint32_t* prevSize,
|
|
|
|
Slice delta,
|
|
|
|
std::string* newValue) {
|
|
|
|
if (prevValue == nullptr) {
|
|
|
|
*newValue = std::string(delta.size(), 'c');
|
|
|
|
return UpdateStatus::UPDATED;
|
|
|
|
} else {
|
|
|
|
*prevSize = 1;
|
|
|
|
std::string str_b = std::string(*prevSize, 'b');
|
|
|
|
memcpy(prevValue, str_b.c_str(), str_b.size());
|
|
|
|
return UpdateStatus::UPDATED_INPLACE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
|
|
|
|
uint32_t* /*prevSize*/,
|
|
|
|
Slice delta,
|
|
|
|
std::string* newValue) {
|
|
|
|
*newValue = std::string(delta.size(), 'c');
|
|
|
|
return UpdateStatus::UPDATED;
|
|
|
|
}
|
|
|
|
|
|
|
|
UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
|
|
|
|
uint32_t* /*prevSize*/,
|
|
|
|
Slice /*delta*/,
|
|
|
|
std::string* /*newValue*/) {
|
|
|
|
return UpdateStatus::UPDATE_FAILED;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Utility method to test InplaceUpdate
|
|
|
|
void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
|
|
|
|
Arena arena;
|
|
|
|
auto options = CurrentOptions();
|
|
|
|
InternalKeyComparator icmp(options.comparator);
|
|
|
|
ReadRangeDelAggregator range_del_agg(&icmp,
|
|
|
|
kMaxSequenceNumber /* upper_bound */);
|
|
|
|
// This should be defined after range_del_agg so that it destructs the
// assigned iterator before range_del_agg is destructed.
|
|
|
|
ReadOptions read_options;
|
|
|
|
ScopedArenaIterator iter;
|
|
|
|
if (cf != 0) {
|
|
|
|
iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg,
|
|
|
|
kMaxSequenceNumber, handles_[cf]));
|
|
|
|
} else {
|
|
|
|
iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg,
|
|
|
|
kMaxSequenceNumber));
|
|
|
|
}
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
int seq = numValues;
|
|
|
|
while (iter->Valid()) {
|
|
|
|
ParsedInternalKey ikey;
|
|
|
|
ikey.clear();
|
|
|
|
ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
|
|
|
|
|
|
|
|
// checks sequence number for updates
|
|
|
|
ASSERT_EQ(ikey.sequence, (unsigned)seq--);
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_EQ(0, seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::CopyFile(const std::string& source,
|
|
|
|
const std::string& destination, uint64_t size) {
|
|
|
|
const EnvOptions soptions;
|
|
|
|
std::unique_ptr<SequentialFile> srcfile;
|
|
|
|
ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
|
|
|
|
std::unique_ptr<WritableFile> destfile;
|
|
|
|
ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
|
|
|
|
|
|
|
|
if (size == 0) {
|
|
|
|
// default argument means copy everything
|
|
|
|
ASSERT_OK(env_->GetFileSize(source, &size));
|
|
|
|
}
|
|
|
|
|
|
|
|
char buffer[4096];
|
|
|
|
Slice slice;
|
|
|
|
while (size > 0) {
|
|
|
|
uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
|
|
|
|
ASSERT_OK(srcfile->Read(one, &slice, buffer));
|
|
|
|
ASSERT_OK(destfile->Append(slice));
|
|
|
|
size -= slice.size();
|
|
|
|
}
|
|
|
|
ASSERT_OK(destfile->Close());
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBTestBase::GetAllDataFiles(
|
|
|
|
const FileType file_type, std::unordered_map<std::string, uint64_t>* files,
|
|
|
|
uint64_t* total_size /* = nullptr */) {
|
|
|
|
if (total_size) {
|
|
|
|
*total_size = 0;
|
|
|
|
}
|
|
|
|
std::vector<std::string> children;
|
|
|
|
Status s = env_->GetChildren(dbname_, &children);
|
|
|
|
if (s.ok()) {
|
|
|
|
for (auto& file_name : children) {
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
if (ParseFileName(file_name, &number, &type) && type == file_type) {
|
|
|
|
std::string file_path = dbname_ + "/" + file_name;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
s = env_->GetFileSize(file_path, &file_size);
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
(*files)[file_path] = file_size;
|
|
|
|
if (total_size) {
|
|
|
|
*total_size += file_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::uint64_t> DBTestBase::ListTableFiles(Env* env,
|
|
|
|
const std::string& path) {
|
|
|
|
std::vector<std::string> files;
|
|
|
|
std::vector<uint64_t> file_numbers;
|
|
|
|
EXPECT_OK(env->GetChildren(path, &files));
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
for (size_t i = 0; i < files.size(); ++i) {
|
|
|
|
if (ParseFileName(files[i], &number, &type)) {
|
|
|
|
if (type == kTableFile) {
|
|
|
|
file_numbers.push_back(number);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return file_numbers;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
|
|
|
|
size_t* total_reads_res, bool tailing_iter,
|
|
|
|
std::map<std::string, Status> status) {
|
|
|
|
size_t total_reads = 0;
|
|
|
|
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff updates the code to pin the merge operator operands while the merge operation is in progress, so that we can eliminate the memcpy cost; to do that we need a new public API for FullMerge that replaces the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput, which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
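To make the new interface concrete, here is a sketch of a "max"-style merge operator written against the MergeOperationInput / MergeOperationOutput API described above; the class name is made up for the example, and the real built-in max operator may differ in details.
```
// Sketch of a FullMergeV2 "max" operator: operands arrive as a
// std::vector<Slice>, and the result can simply point at an existing slice
// via existing_operand instead of being copied into new_value.
class MaxOperatorSketch : public MergeOperator {
 public:
  bool FullMergeV2(const MergeOperationInput& merge_in,
                   MergeOperationOutput* merge_out) const override {
    Slice max;
    if (merge_in.existing_value != nullptr) {
      max = *merge_in.existing_value;
    }
    for (const Slice& operand : merge_in.operand_list) {
      if (max.compare(operand) < 0) {
        max = operand;
      }
    }
    // No memcpy: hand back a reference to the winning slice.
    merge_out->existing_operand = max;
    return true;
  }

  const char* Name() const override { return "MaxOperatorSketch"; }
};
```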
|
|
|
for (auto& kv : true_data) {
|
|
|
|
Status s = status[kv.first];
|
|
|
|
if (s.ok()) {
|
|
|
|
ASSERT_EQ(Get(kv.first), kv.second);
|
|
|
|
} else {
|
|
|
|
std::string value;
|
|
|
|
ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value));
|
|
|
|
}
|
|
|
|
total_reads++;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Normal Iterator
|
|
|
|
{
|
|
|
|
int iter_cnt = 0;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
Iterator* iter = db_->NewIterator(ro);
|
|
|
|
// Verify Iterator::Next()
|
|
|
|
iter_cnt = 0;
|
|
|
|
auto data_iter = true_data.begin();
|
|
|
|
Status s;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_iter->first);
|
|
|
|
Status current_status = status[data_iter->first];
|
|
|
|
if (!current_status.ok()) {
|
|
|
|
s = current_status;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(iter->status(), s);
|
|
|
|
if (current_status.ok()) {
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_iter->second);
|
|
|
|
}
|
|
|
|
iter_cnt++;
|
|
|
|
total_reads++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / "
|
|
|
|
<< true_data.size();
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// Verify Iterator::Prev()
|
|
|
|
// Use a new iterator to make sure its status is clean.
|
|
|
|
iter = db_->NewIterator(ro);
|
|
|
|
iter_cnt = 0;
|
|
|
|
s = Status::OK();
|
|
|
|
auto data_rev = true_data.rbegin();
|
|
|
|
for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) {
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_rev->first);
|
|
|
|
Status current_status = status[data_rev->first];
|
|
|
|
if (!current_status.ok()) {
|
|
|
|
s = current_status;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(iter->status(), s);
|
|
|
|
if (current_status.ok()) {
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_rev->second);
|
|
|
|
}
|
|
|
|
iter_cnt++;
|
|
|
|
total_reads++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(data_rev, true_data.rend()) << iter_cnt << " / "
|
|
|
|
<< true_data.size();
|
|
|
|
|
|
|
|
// Verify Iterator::Seek()
|
|
|
|
for (auto kv : true_data) {
|
|
|
|
iter->Seek(kv.first);
|
|
|
|
ASSERT_EQ(kv.first, iter->key().ToString());
|
|
|
|
ASSERT_EQ(kv.second, iter->value().ToString());
|
|
|
|
total_reads++;
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tailing_iter) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Tailing iterator
|
|
|
|
int iter_cnt = 0;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.tailing = true;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
Iterator* iter = db_->NewIterator(ro);
|
|
|
|
|
|
|
|
// Verify ForwardIterator::Next()
|
|
|
|
iter_cnt = 0;
|
|
|
|
auto data_iter = true_data.begin();
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
|
|
|
|
ASSERT_EQ(iter->key().ToString(), data_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), data_iter->second);
|
|
|
|
iter_cnt++;
|
|
|
|
total_reads++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / "
|
|
|
|
<< true_data.size();
|
|
|
|
|
|
|
|
// Verify ForwardIterator::Seek()
|
|
|
|
for (auto kv : true_data) {
|
|
|
|
iter->Seek(kv.first);
|
|
|
|
ASSERT_EQ(kv.first, iter->key().ToString());
|
|
|
|
ASSERT_EQ(kv.second, iter->value().ToString());
|
|
|
|
total_reads++;
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
|
|
|
if (total_reads_res) {
|
|
|
|
*total_reads_res = total_reads;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
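// Verifies the DB's contents via an internal (arena-allocated) iterator:
// the sequence of internal keys and values must match `true_data` exactly.
// Because the iterator is allocated from the Arena, it is destroyed with an
// explicit destructor call instead of `delete`.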
void DBTestBase::VerifyDBInternal(
|
|
|
|
std::vector<std::pair<std::string, std::string>> true_data) {
|
|
|
|
Arena arena;
|
|
|
|
InternalKeyComparator icmp(last_options_.comparator);
|
|
|
|
ReadRangeDelAggregator range_del_agg(&icmp,
|
|
|
|
kMaxSequenceNumber /* upper_bound */);
|
|
|
|
ReadOptions read_options;
|
|
|
|
auto iter = dbfull()->NewInternalIterator(read_options, &arena,
|
|
|
|
&range_del_agg, kMaxSequenceNumber);
|
|
|
|
iter->SeekToFirst();
|
|
|
|
for (auto p : true_data) {
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ParsedInternalKey ikey;
|
|
|
|
ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
|
|
|
|
ASSERT_EQ(p.first, ikey.user_key);
|
|
|
|
ASSERT_EQ(p.second, iter->value());
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
iter->~InternalIterator();
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
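// Counts the live SST files whose metadata belongs to `column_family_name`.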
uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
|
|
|
|
DB* db, std::string column_family_name) {
|
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
db->GetLiveFilesMetaData(&metadata);
|
|
|
|
uint64_t result = 0;
|
|
|
|
for (auto& fileMetadata : metadata) {
|
|
|
|
result += (fileMetadata.column_family_name == column_family_name);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
3 years ago
|
|
|
void VerifySstUniqueIds(const TablePropertiesCollection& props) {
|
|
|
|
ASSERT_FALSE(props.empty()); // suspicious test if empty
|
|
|
|
std::unordered_set<std::string> seen;
|
|
|
|
for (auto& pair : props) {
|
|
|
|
std::string id;
|
|
|
|
ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id));
|
|
|
|
ASSERT_TRUE(seen.insert(id).second);
|
|
|
|
}
|
|
|
|
}
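// A minimal usage sketch (hypothetical; not part of the harness itself): a
// test deriving from DBTestBase could collect the table properties of the
// default column family after flushing some data, then assert ID uniqueness:
//
//   TablePropertiesCollection props;
//   ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
//   VerifySstUniqueIds(props);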
|
|
|
|
|
|
|
|
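// TargetCacheChargeTrackingCache wraps a Cache and tracks the charges of
// cache reservation entries (those inserted with CacheReservationManager's
// no-op deleter): the current outstanding charge, the peak charge reached in
// each "episode" of reservations, and the sum of per-episode increments.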
template <CacheEntryRole R>
|
|
|
|
TargetCacheChargeTrackingCache<R>::TargetCacheChargeTrackingCache(
|
|
|
|
std::shared_ptr<Cache> target)
|
|
|
|
: CacheWrapper(std::move(target)),
|
|
|
|
cur_cache_charge_(0),
|
|
|
|
cache_charge_peak_(0),
|
|
|
|
cache_charge_increment_(0),
|
|
|
|
last_peak_tracked_(false),
|
|
|
|
cache_charge_increments_sum_(0) {}
|
|
|
|
|
|
|
|
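// Insert() updates tracking only for reservation entries (recognized by the
// no-op deleter). An insert that follows a completed episode resets the
// per-episode peak and increment before the new charge is accumulated.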
template <CacheEntryRole R>
|
|
|
|
Status TargetCacheChargeTrackingCache<R>::Insert(
|
|
|
|
const Slice& key, void* value, size_t charge,
|
|
|
|
void (*deleter)(const Slice& key, void* value), Handle** handle,
|
|
|
|
Priority priority) {
|
|
|
|
Status s = target_->Insert(key, value, charge, deleter, handle, priority);
|
|
|
|
if (deleter == kNoopDeleter) {
|
|
|
|
if (last_peak_tracked_) {
|
|
|
|
cache_charge_peak_ = 0;
|
|
|
|
cache_charge_increment_ = 0;
|
|
|
|
last_peak_tracked_ = false;
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
cur_cache_charge_ += charge;
|
|
|
|
}
|
|
|
|
cache_charge_peak_ = std::max(cache_charge_peak_, cur_cache_charge_);
|
|
|
|
cache_charge_increment_ += charge;
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
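// The first Release() of a reservation entry after some inserts closes the
// current episode: the observed peak and the episode's summed increments are
// recorded before the outstanding charge is decremented.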
template <CacheEntryRole R>
|
|
|
|
bool TargetCacheChargeTrackingCache<R>::Release(Handle* handle,
|
|
|
|
bool erase_if_last_ref) {
|
|
|
|
auto deleter = GetDeleter(handle);
|
|
|
|
if (deleter == kNoopDeleter) {
|
|
|
|
if (!last_peak_tracked_) {
|
|
|
|
cache_charge_peaks_.push_back(cache_charge_peak_);
|
|
|
|
cache_charge_increments_sum_ += cache_charge_increment_;
|
|
|
|
last_peak_tracked_ = true;
|
|
|
|
}
|
|
|
|
cur_cache_charge_ -= GetCharge(handle);
|
|
|
|
}
|
|
|
|
bool is_successful = target_->Release(handle, erase_if_last_ref);
|
|
|
|
return is_successful;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <CacheEntryRole R>
|
|
|
|
const Cache::DeleterFn TargetCacheChargeTrackingCache<R>::kNoopDeleter =
|
|
|
|
CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole();
|
|
|
|
|
|
|
|
template class TargetCacheChargeTrackingCache<
|
|
|
|
CacheEntryRole::kFilterConstruction>;
|
|
|
|
template class TargetCacheChargeTrackingCache<
|
|
|
|
CacheEntryRole::kBlockBasedTableReader>;
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|