|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
//
|
|
|
|
// The test uses an array to compare against values written to the database.
|
|
|
|
// Keys written to the array are in 1:1 correspondence to the actual values in
|
|
|
|
// the database according to the formula in the function GenerateValue.
|
|
|
|
|
|
|
|
// Space is reserved in the array from 0 to FLAGS_max_key and values are
|
|
|
|
// randomly written/deleted/read from those positions. During verification we
|
|
|
|
// compare all the positions in the array. To shorten/elongate the running
|
|
|
|
// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
|
|
|
|
// (sometimes also FLAGS_threads).
|
|
|
|
//
|
|
|
|
// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
|
|
|
|
// different behavior. See comment of the flag for details.
|
|
|
|
|
|
|
|
#ifdef GFLAGS
|
|
|
|
#include "db_stress_tool/db_stress_common.h"
|
|
|
|
#include "db_stress_tool/db_stress_driver.h"
|
|
|
|
#include "rocksdb/convenience.h"
|
|
|
|
#include "utilities/fault_injection_fs.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace {
|
|
|
|
static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
|
|
|
|
static std::shared_ptr<ROCKSDB_NAMESPACE::DbStressEnvWrapper> env_wrapper_guard;
|
|
|
|
static std::shared_ptr<ROCKSDB_NAMESPACE::DbStressEnvWrapper>
|
|
|
|
dbsl_env_wrapper_guard;
|
|
|
|
static std::shared_ptr<CompositeEnvWrapper> fault_env_guard;
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Key-generation parameters. db_stress_tool() fills in `window` (from
// FLAGS_key_window_scale_factor) and `weights` (from
// FLAGS_key_len_percent_dist, or an even split across FLAGS_max_key_len
// levels when that flag is empty).
KeyGenContext key_gen_ctx;
|
|
|
|
|
|
|
|
int db_stress_tool(int argc, char** argv) {
|
|
|
|
SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
|
|
|
|
" [OPTIONS]...");
|
|
|
|
ParseCommandLineFlags(&argc, &argv, true);
|
|
|
|
|
|
|
|
SanitizeDoubleParam(&FLAGS_bloom_bits);
|
|
|
|
SanitizeDoubleParam(&FLAGS_memtable_prefix_bloom_size_ratio);
|
|
|
|
SanitizeDoubleParam(&FLAGS_max_bytes_for_level_multiplier);
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
if (FLAGS_mock_direct_io) {
|
|
|
|
SetupSyncPointsToMockDirectIO();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
if (FLAGS_statistics) {
|
|
|
|
dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
if (FLAGS_test_secondary) {
|
|
|
|
dbstats_secondaries = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
compression_type_e = StringToCompressionType(FLAGS_compression_type.c_str());
|
|
|
|
bottommost_compression_type_e =
|
|
|
|
StringToCompressionType(FLAGS_bottommost_compression_type.c_str());
|
|
|
|
checksum_type_e = StringToChecksumType(FLAGS_checksum_type.c_str());
|
|
|
|
|
|
|
|
Env* raw_env;
|
|
|
|
|
|
|
|
int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
|
|
|
|
if (env_opts > 1) {
|
|
|
|
fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status s = Env::CreateFromUri(ConfigOptions(), FLAGS_env_uri, FLAGS_fs_uri,
|
|
|
|
&raw_env, &env_guard);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "Error Creating Env URI: %s: %s\n", FLAGS_env_uri.c_str(),
|
|
|
|
s.ToString().c_str());
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
dbsl_env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
|
|
|
|
db_stress_listener_env = dbsl_env_wrapper_guard.get();
|
|
|
|
|
|
|
|
if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection ||
|
|
|
|
FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in ||
|
|
|
|
FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) {
|
|
|
|
FaultInjectionTestFS* fs =
|
|
|
|
new FaultInjectionTestFS(raw_env->GetFileSystem());
|
|
|
|
fault_fs_guard.reset(fs);
|
|
|
|
if (FLAGS_write_fault_one_in) {
|
|
|
|
fault_fs_guard->SetFilesystemDirectWritable(false);
|
|
|
|
} else {
|
|
|
|
fault_fs_guard->SetFilesystemDirectWritable(true);
|
|
|
|
}
|
|
|
|
fault_env_guard =
|
|
|
|
std::make_shared<CompositeEnvWrapper>(raw_env, fault_fs_guard);
|
|
|
|
raw_env = fault_env_guard.get();
|
|
|
|
}
|
|
|
|
|
|
|
|
env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
|
|
|
|
db_stress_env = env_wrapper_guard.get();
|
|
|
|
|
|
|
|
if (FLAGS_write_fault_one_in) {
|
|
|
|
// In the write injection case, we need to use the FS interface and returns
|
|
|
|
// the IOStatus with different error and flags. Therefore,
|
|
|
|
// DbStressEnvWrapper cannot be used which will swallow the FS
|
|
|
|
// implementations. We should directly use the raw_env which is the
|
|
|
|
// CompositeEnvWrapper of env and fault_fs.
|
|
|
|
db_stress_env = raw_env;
|
|
|
|
}
|
|
|
|
|
|
|
|
FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
|
|
|
|
|
|
|
|
// The number of background threads should be at least as much the
|
|
|
|
// max number of concurrent compactions.
|
|
|
|
db_stress_env->SetBackgroundThreads(FLAGS_max_background_compactions,
|
|
|
|
ROCKSDB_NAMESPACE::Env::Priority::LOW);
|
|
|
|
db_stress_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
|
|
|
|
ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
|
|
|
|
if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size < 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: prefixpercent is non-zero while prefix_size is "
|
|
|
|
"not positive!\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: please specify prefix_size for "
|
|
|
|
"test_batches_snapshots test!\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0 &&
|
|
|
|
!FLAGS_memtable_whole_key_filtering) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: please specify positive prefix_size or enable whole key "
|
|
|
|
"filtering in order to use memtable_prefix_bloom_size_ratio\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if ((FLAGS_readpercent + FLAGS_prefixpercent + FLAGS_writepercent +
|
|
|
|
FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent +
|
|
|
|
FLAGS_customopspercent) != 100) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"Error: "
|
|
|
|
"Read(-readpercent=%d)+Prefix(-prefixpercent=%d)+Write(-writepercent=%"
|
|
|
|
"d)+Delete(-delpercent=%d)+DeleteRange(-delrangepercent=%d)"
|
|
|
|
"+Iterate(-iterpercent=%d)+CustomOps(-customopspercent=%d) percents != "
|
|
|
|
"100!\n",
|
|
|
|
FLAGS_readpercent, FLAGS_prefixpercent, FLAGS_writepercent,
|
|
|
|
FLAGS_delpercent, FLAGS_delrangepercent, FLAGS_iterpercent,
|
|
|
|
FLAGS_customopspercent);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) {
|
|
|
|
fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: #DB-reopens should be < ops_per_thread\n"
|
|
|
|
"Provided reopens = %d and ops_per_thread = %lu\n",
|
|
|
|
FLAGS_reopen, (unsigned long)FLAGS_ops_per_thread);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_test_batches_snapshots && FLAGS_delrangepercent > 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: nonzero delrangepercent unsupported in "
|
|
|
|
"test_batches_snapshots mode\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_active_width > FLAGS_max_key) {
|
|
|
|
fprintf(stderr, "Error: active_width can be at most max_key\n");
|
|
|
|
exit(1);
|
|
|
|
} else if (FLAGS_active_width == 0) {
|
|
|
|
FLAGS_active_width = FLAGS_max_key;
|
|
|
|
}
|
|
|
|
if (FLAGS_value_size_mult * kRandomValueMaxFactor > kValueMaxLen) {
|
|
|
|
fprintf(stderr, "Error: value_size_mult can be at most %d\n",
|
|
|
|
kValueMaxLen / kRandomValueMaxFactor);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_use_merge && FLAGS_nooverwritepercent == 100) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"Error: nooverwritepercent must not be 100 when using merge operands");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_ingest_external_file_one_in > 0 &&
|
|
|
|
FLAGS_nooverwritepercent == 100) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"Error: nooverwritepercent must not be 100 when using file ingestion");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_clear_column_family_one_in > 0 && FLAGS_backup_one_in > 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: clear_column_family_one_in must be 0 when using backup\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_test_cf_consistency && FLAGS_disable_wal) {
|
|
|
|
FLAGS_atomic_flush = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (FLAGS_read_only) {
|
|
|
|
if (FLAGS_writepercent != 0 || FLAGS_delpercent != 0 ||
|
|
|
|
FLAGS_delrangepercent != 0) {
|
|
|
|
fprintf(stderr, "Error: updates are not supported in read only mode\n");
|
|
|
|
exit(1);
|
|
|
|
} else if (FLAGS_checkpoint_one_in > 0 &&
|
|
|
|
FLAGS_clear_column_family_one_in > 0) {
|
|
|
|
fprintf(stdout,
|
|
|
|
"Warn: checkpoint won't be validated since column families may "
|
|
|
|
"be dropped.\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Choose a location for the test database if none given with --db=<path>
|
|
|
|
if (FLAGS_db.empty()) {
|
|
|
|
std::string default_db_path;
|
|
|
|
db_stress_env->GetTestDirectory(&default_db_path);
|
|
|
|
default_db_path += "/dbstress";
|
|
|
|
FLAGS_db = default_db_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((FLAGS_test_secondary || FLAGS_continuous_verification_interval > 0) &&
|
|
|
|
FLAGS_secondaries_base.empty()) {
|
|
|
|
std::string default_secondaries_path;
|
|
|
|
db_stress_env->GetTestDirectory(&default_secondaries_path);
|
|
|
|
default_secondaries_path += "/dbstress_secondaries";
|
|
|
|
s = db_stress_env->CreateDirIfMissing(default_secondaries_path);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "Failed to create directory %s: %s\n",
|
|
|
|
default_secondaries_path.c_str(), s.ToString().c_str());
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
FLAGS_secondaries_base = default_secondaries_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (FLAGS_best_efforts_recovery && !FLAGS_skip_verifydb &&
|
|
|
|
!FLAGS_disable_wal) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"With best-efforts recovery, either skip_verifydb or disable_wal "
|
|
|
|
"should be set to true.\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_skip_verifydb) {
|
|
|
|
if (FLAGS_verify_db_one_in > 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Must set -verify_db_one_in=0 if skip_verifydb is true.\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_continuous_verification_interval > 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Must set -continuous_verification_interval=0 if skip_verifydb "
|
|
|
|
"is true.\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (FLAGS_enable_compaction_filter &&
|
|
|
|
(FLAGS_acquire_snapshot_one_in > 0 || FLAGS_compact_range_one_in > 0 ||
|
|
|
|
FLAGS_iterpercent > 0 || FLAGS_test_batches_snapshots ||
|
|
|
|
FLAGS_test_cf_consistency)) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"Error: acquire_snapshot_one_in, compact_range_one_in, iterpercent, "
|
|
|
|
"test_batches_snapshots must all be 0 when using compaction filter\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_test_multi_ops_txns) {
|
|
|
|
CheckAndSetOptionsForMultiOpsTxnStressTest();
|
|
|
|
}
|
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2 years ago
|
|
|
if (FLAGS_create_timestamped_snapshot_one_in > 0) {
|
|
|
|
if (!FLAGS_use_txn) {
|
|
|
|
fprintf(stderr, "timestamped snapshot supported only in TransactionDB\n");
|
|
|
|
exit(1);
|
|
|
|
} else if (FLAGS_txn_write_policy != 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"timestamped snapshot supported only in write-committed\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
db_stress option to preserve all files until verification success (#10659)
Summary:
In `db_stress`, DB and expected state files containing changes leading up to a verification failure are often deleted, which makes debugging such failures difficult. On the DB side, flushed WAL files and compacted SST files are marked obsolete and then deleted. Without those files, we cannot pinpoint where a key that failed verification changed unexpectedly. On the expected state side, files for verifying prefix-recoverability in the presence of unsynced data loss are deleted before verification. These include a baseline state file containing the expected state at the time of the last successful verification, and a trace file containing all operations since then. Without those files, we cannot know the sequence of DB operations expected to be recovered.
This PR attempts to address this gap with a new `db_stress` flag: `preserve_unverified_changes`. Setting `preserve_unverified_changes=1` has two effects.
First, prior to startup verification, `db_stress` hardlinks all DB and expected state files in "unverified/" subdirectories of `FLAGS_db` and `FLAGS_expected_values_dir`. The separate directories are needed because the pre-verification opening process deletes files written by the previous `db_stress` run as described above. These "unverified/" subdirectories are cleaned up following startup verification success.
I considered other approaches for preserving DB files through startup verification, like using a read-only DB or preventing deletion of DB files externally, e.g., in the `Env` layer. However, I decided against it since such an approach would not work for expected state files, and I did not want to change the DB management logic. If there were a way to disable DB file deletions before regular DB open, I would have preferred to use that.
Second, `db_stress` attempts to keep all DB and expected state files that were live at some point since the start of the `db_stress` run. This is a bit tricky and involves the following changes.
- Open the DB with `disable_auto_compactions=1` and `avoid_flush_during_recovery=1`
- DisableFileDeletions()
- EnableAutoCompactions()
For this part, too, I would have preferred to use a hypothetical API that disables DB file deletion before regular DB open.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10659
Reviewed By: hx235
Differential Revision: D39407454
Pulled By: ajkr
fbshipit-source-id: 6e981025c7dce147649d2e770728471395a7fa53
2 years ago
|
|
|
if (FLAGS_preserve_unverified_changes && FLAGS_reopen != 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Reopen DB is incompatible with preserving unverified changes\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
Support WriteCommit policy with sync_fault_injection=1 (#10624)
Summary:
**Context:**
Prior to this PR, correctness testing with un-sync data loss [disabled](https://github.com/facebook/rocksdb/pull/10605) transaction (`use_txn=1`) thus all of the `txn_write_policy` . This PR improved that by adding support for one policy - WriteCommit (`txn_write_policy=0`).
**Summary:**
They key to this support is (a) handle Mark{Begin, End}Prepare/MarkCommit/MarkRollback in constructing ExpectedState under WriteCommit policy correctly and (b) monitor CI jobs and solve any test incompatibility issue till jobs are stable. (b) will be part of the test plan.
For (a)
- During prepare (i.e, between `MarkBeginPrepare()` and `MarkEndPrepare(xid)`), `ExpectedStateTraceRecordHandler` will buffer all writes by adding all writes to an internal `WriteBatch`.
- On `MarkEndPrepare()`, that `WriteBatch` will be associated with the transaction's `xid`.
- During the commit (i.e, on `MarkCommit(xid)`), `ExpectedStateTraceRecordHandler` will retrieve and iterate the internal `WriteBatch` and finally apply those writes to `ExpectedState`
- During the rollback (i.e, on `MarkRollback(xid)`), `ExpectedStateTraceRecordHandler` will erase the internal `WriteBatch` from the map.
For (b) - one major issue described below:
- TransactionsDB in db stress recovers prepared-but-not-committed txns from the previous crashed run by randomly committing or rolling back it at the start of the current run, see a historical [PR](https://github.com/facebook/rocksdb/commit/6d06be22c083ccf185fd38dba49fde73b644b4c1) predated correctness testing.
- And we will verify those processed keys in a recovered db against their expected state.
- However since now we turn on `sync_fault_injection=1` where the expected state is constructed from the trace instead of using the LATEST.state from previous run. The expected state now used to verify those processed keys won't contain UNKNOWN_SENTINEL as they should - see test 1 for a failed case.
- Therefore, we decided to manually update its expected state to be UNKNOWN_SENTINEL as part of the processing.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10624
Test Plan:
1. Test exposed the major issue described above. This test will fail without setting UNKNOWN_SENTINEL in expected state during the processing and pass after
```
db=/dev/shm/rocksdb_crashtest_blackbox
exp=/dev/shm/rocksdb_crashtest_expected
dbt=$db.tmp
expt=$exp.tmp
rm -rf $db $exp
mkdir -p $exp
echo "RUN 1"
./db_stress \
--clear_column_family_one_in=0 --column_families=1 --db=$db --delpercent=10 --delrangepercent=0 --destroy_db_initially=0 --expected_values_dir=$exp --iterpercent=0 --key_len_percent_dist=1,30,69 --max_key=1000000 --max_key_len=3 --prefixpercent=0 --readpercent=0 --reopen=0 --ops_per_thread=100000000 --test_batches_snapshots=0 --value_size_mult=32 --writepercent=90 \
--use_txn=1 --txn_write_policy=0 --sync_fault_injection=1 &
pid=$!
sleep 0.2
sleep 20
kill $pid
sleep 0.2
echo "RUN 2"
./db_stress \
--clear_column_family_one_in=0 --column_families=1 --db=$db --delpercent=10 --delrangepercent=0 --destroy_db_initially=0 --expected_values_dir=$exp --iterpercent=0 --key_len_percent_dist=1,30,69 --max_key=1000000 --max_key_len=3 --prefixpercent=0 --readpercent=0 --reopen=0 --ops_per_thread=100000000 --test_batches_snapshots=0 --value_size_mult=32 --writepercent=90 \
--use_txn=1 --txn_write_policy=0 --sync_fault_injection=1 &
pid=$!
sleep 0.2
sleep 20
kill $pid
sleep 0.2
echo "RUN 3"
./db_stress \
--clear_column_family_one_in=0 --column_families=1 --db=$db --delpercent=10 --delrangepercent=0 --destroy_db_initially=0 --expected_values_dir=$exp --iterpercent=0 --key_len_percent_dist=1,30,69 --max_key=1000000 --max_key_len=3 --prefixpercent=0 --readpercent=0 --reopen=0 --ops_per_thread=100000000 --test_batches_snapshots=0 --value_size_mult=32 --writepercent=90 \
--use_txn=1 --txn_write_policy=0 --sync_fault_injection=1
```
2. Manual testing to ensure ExpectedState is constructed correctly during recovery by verifying it against previously crashed TransactionDB's WAL.
- Run the following command to crash a TransactionDB with WriteCommit policy. Then `./ldb dump_wal` on its WAL file
```
db=/dev/shm/rocksdb_crashtest_blackbox
exp=/dev/shm/rocksdb_crashtest_expected
rm -rf $db $exp
mkdir -p $exp
./db_stress \
--clear_column_family_one_in=0 --column_families=1 --db=$db --delpercent=10 --delrangepercent=0 --destroy_db_initially=0 --expected_values_dir=$exp --iterpercent=0 --key_len_percent_dist=1,30,69 --max_key=1000000 --max_key_len=3 --prefixpercent=0 --readpercent=0 --reopen=0 --ops_per_thread=100000000 --test_batches_snapshots=0 --value_size_mult=32 --writepercent=90 \
--use_txn=1 --txn_write_policy=0 --sync_fault_injection=1 &
pid=$!
sleep 30
kill $pid
sleep 1
```
- Run the following command to verify recovery of the crashed db under debugger. Compare the step-wise result with WAL records (e.g, WriteBatch content, xid, prepare/commit/rollback marker)
```
./db_stress \
--clear_column_family_one_in=0 --column_families=1 --db=$db --delpercent=10 --delrangepercent=0 --destroy_db_initially=0 --expected_values_dir=$exp --iterpercent=0 --key_len_percent_dist=1,30,69 --max_key=1000000 --max_key_len=3 --prefixpercent=0 --readpercent=0 --reopen=0 --ops_per_thread=100000000 --test_batches_snapshots=0 --value_size_mult=32 --writepercent=90 \
--use_txn=1 --txn_write_policy=0 --sync_fault_injection=1
```
3. Automatic testing by triggering all RocksDB stress/crash test jobs for 3 rounds with no failure.
Reviewed By: ajkr, riversand963
Differential Revision: D39199373
Pulled By: hx235
fbshipit-source-id: 7a1dec0e3e2ee6ea86ddf5dd19ceb5543a3d6f0c
2 years ago
|
|
|
if (FLAGS_use_txn && FLAGS_sync_fault_injection &&
|
|
|
|
FLAGS_txn_write_policy != 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"For TransactionDB, correctness testing with unsync data loss is "
|
|
|
|
"currently compatible with only write committed policy\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
KillPoint* kp = KillPoint::GetInstance();
|
|
|
|
kp->rocksdb_kill_odds = FLAGS_kill_random_test;
|
|
|
|
kp->rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
unsigned int levels = FLAGS_max_key_len;
|
|
|
|
std::vector<std::string> weights;
|
|
|
|
uint64_t scale_factor = FLAGS_key_window_scale_factor;
|
|
|
|
key_gen_ctx.window = scale_factor * 100;
|
|
|
|
if (!FLAGS_key_len_percent_dist.empty()) {
|
|
|
|
weights = SplitString(FLAGS_key_len_percent_dist);
|
|
|
|
if (weights.size() != levels) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Number of weights in key_len_dist should be equal to"
|
|
|
|
" max_key_len");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t total_weight = 0;
|
|
|
|
for (std::string& weight : weights) {
|
|
|
|
uint64_t val = std::stoull(weight);
|
|
|
|
key_gen_ctx.weights.emplace_back(val * scale_factor);
|
|
|
|
total_weight += val;
|
|
|
|
}
|
|
|
|
if (total_weight != 100) {
|
|
|
|
fprintf(stderr, "Sum of all weights in key_len_dist should be 100");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
uint64_t keys_per_level = key_gen_ctx.window / levels;
|
|
|
|
for (unsigned int level = 0; level + 1 < levels; ++level) {
|
|
|
|
key_gen_ctx.weights.emplace_back(keys_per_level);
|
|
|
|
}
|
|
|
|
key_gen_ctx.weights.emplace_back(key_gen_ctx.window -
|
|
|
|
keys_per_level * (levels - 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<ROCKSDB_NAMESPACE::StressTest> stress;
|
|
|
|
if (FLAGS_test_cf_consistency) {
|
|
|
|
stress.reset(CreateCfConsistencyStressTest());
|
|
|
|
} else if (FLAGS_test_batches_snapshots) {
|
|
|
|
stress.reset(CreateBatchedOpsStressTest());
|
|
|
|
} else if (FLAGS_test_multi_ops_txns) {
|
|
|
|
stress.reset(CreateMultiOpsTxnsStressTest());
|
|
|
|
} else {
|
|
|
|
stress.reset(CreateNonBatchedOpsStressTest());
|
|
|
|
}
|
|
|
|
// Initialize the Zipfian pre-calculated array
|
|
|
|
InitializeHotKeyGenerator(FLAGS_hot_key_alpha);
|
|
|
|
if (RunStressTest(stress.get())) {
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
#endif // GFLAGS
|