`db_stress` tolerate incomplete tail records in trace file (#9316)

Summary:
I saw the following error when running the crash test for a while with
unsynced data loss:

```
Error restoring historical expected values: Corruption: Corrupted trace file.
```

The trace file turned out to have an incomplete tail record. This is
expected, since the blackbox crash test kills `db_stress` while a trace
may still be in progress.

In the case where the trace file is not otherwise corrupted, there
should be enough records already seen to sync up the expected state with
the recovered DB. This PR ignores any `Status::Corruption` the
`Replayer` returns when that happens.
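
For orientation, here is a minimal sketch of the replay loop inside
`FileExpectedStateManager::Restore()` with the new tolerance check. The setup
of `replayer` and `handler` is omitted, the loop body is paraphrased from the
surrounding code rather than quoted from this commit, and only the
`IsCorruption()` block is what this PR adds (see the diff below).

```
// Paraphrased fragment of FileExpectedStateManager::Restore(); `replayer`
// and `handler` (an ExpectedStateTraceRecordHandler) are created earlier in
// the function.
Status s = replayer->Prepare();
for (; s.ok();) {
  std::unique_ptr<TraceRecord> record;
  s = replayer->Next(&record);
  if (!s.ok()) {
    break;
  }
  std::unique_ptr<TraceRecordResult> res;
  record->Accept(handler.get(), &res);
}
if (s.IsCorruption() && handler->IsDone()) {
  // An incomplete tail record, written while `db_stress` was being killed,
  // surfaces as Corruption. It is harmless once the handler has applied all
  // the write ops it needs, i.e. IsDone() is true.
  s = Status::OK();
}
if (s.IsIncomplete()) {
  // Incomplete is the normal signal for reaching the end of the trace.
  s = Status::OK();
}
```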

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9316

Reviewed By: jay-zhuang

Differential Revision: D33230579

Pulled By: ajkr

fbshipit-source-id: 9814af4e39e57f00d85be7404363211762f9b41b
Branch: main
Author: Andrew Kryczka
Committer: Facebook GitHub Bot
Parent: 791723c1ec
Commit: b448b71222

db_stress_tool/expected_state.cc:

```
@@ -338,13 +338,14 @@ class ExpectedStateTraceRecordHandler : public TraceRecord::Handler,
   ExpectedStateTraceRecordHandler(uint64_t max_write_ops, ExpectedState* state)
       : max_write_ops_(max_write_ops), state_(state) {}
 
-  ~ExpectedStateTraceRecordHandler() {
-    assert(num_write_ops_ == max_write_ops_);
-  }
+  ~ExpectedStateTraceRecordHandler() { assert(IsDone()); }
+
+  // True if we have already reached the limit on write operations to apply.
+  bool IsDone() { return num_write_ops_ == max_write_ops_; }
 
   Status Handle(const WriteQueryTraceRecord& record,
                 std::unique_ptr<TraceRecordResult>* /* result */) override {
-    if (num_write_ops_ == max_write_ops_) {
+    if (IsDone()) {
       return Status::OK();
     }
     WriteBatch batch(record.GetWriteBatchRep().ToString());
@@ -466,7 +467,7 @@ Status FileExpectedStateManager::Restore(DB* db) {
   {
     std::unique_ptr<Replayer> replayer;
     std::unique_ptr<ExpectedState> state;
-    std::unique_ptr<TraceRecord::Handler> handler;
+    std::unique_ptr<ExpectedStateTraceRecordHandler> handler;
     if (s.ok()) {
       state.reset(new FileExpectedState(latest_file_temp_path, max_key_,
                                         num_column_families_));
@@ -494,6 +495,13 @@ Status FileExpectedStateManager::Restore(DB* db) {
       std::unique_ptr<TraceRecordResult> res;
       record->Accept(handler.get(), &res);
     }
+    if (s.IsCorruption() && handler->IsDone()) {
+      // There could be a corruption reading the tail record of the trace due to
+      // `db_stress` crashing while writing it. It shouldn't matter as long as
+      // we already found all the write ops we need to catch up the expected
+      // state.
+      s = Status::OK();
+    }
     if (s.IsIncomplete()) {
       // OK because `Status::Incomplete` is expected upon finishing all the
       // trace records.
```
