Added `TraceOptions::preserve_write_order` (#9334)

Summary:
This option causes trace records to be written in the serialized write thread. That way, the write records in the trace must follow the same order as writes that are logged to WAL and writes that are applied to the DB.

By default I left it disabled to match existing behavior. I enabled it in `db_stress`, though, as that use case requires order of write records in trace matches the order in WAL.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9334

Test Plan:
- See if below unsynced data loss crash test can run  for 24h straight. It used to crash after a few hours when reaching an unlucky trace ordering.

```
DEBUG_LEVEL=0 TEST_TMPDIR=/dev/shm /usr/local/bin/python3 -u tools/db_crashtest.py blackbox --interval=10 --max_key=100000 --write_buffer_size=524288 --target_file_size_base=524288 --max_bytes_for_level_base=2097152 --value_size_mult=33 --sync_fault_injection=1 --test_batches_snapshots=0 --duration=86400
```

Reviewed By: zhichao-cao

Differential Revision: D33301990

Pulled By: ajkr

fbshipit-source-id: 82d97559727adb4462a7af69758449c8725b22d3
main
Andrew Kryczka 3 years ago committed by Facebook GitHub Bot
parent 2ee20a669d
commit aa2b3bf675
  1. 1
      HISTORY.md
  2. 40
      db/db_impl/db_impl_write.cc
  3. 1
      db_stress_tool/expected_state.cc
  4. 7
      include/rocksdb/options.h
  5. 4
      trace_replay/trace_replay.h

@ -2,6 +2,7 @@
## Unreleased
### Public API change
* Added values to `TraceFilterType`: `kTraceFilterIteratorSeek`, `kTraceFilterIteratorSeekForPrev`, and `kTraceFilterMultiGet`. They can be set in `TraceOptions` to filter out the operation types after which they are named.
* Added `TraceOptions::preserve_write_order`. When enabled it guarantees write records are traced in the same order they are logged to WAL and applied to the DB. By default it is disabled (false) to match the legacy behavior and prevent regression.
## 6.28.0 (2021-12-17)
### New Features

@ -75,9 +75,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (my_batch == nullptr) {
return Status::Corruption("Batch is nullptr!");
}
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_) {
if (tracer_ && !tracer_->IsWriteOrderPreserved()) {
// We don't have to preserve write order so can trace anywhere. It's more
// efficient to trace here than to add latency to a phase of the log/apply
// pipeline.
// TODO: maybe handle the tracing status?
tracer_->Write(my_batch).PermitUncheckedError();
}
@ -249,6 +254,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
IOStatus io_s;
Status pre_release_cb_status;
if (status.ok()) {
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : write_group) {
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
// Rules for when we can update the memtable concurrently
// 1. supported by memtable
// 2. Puts are not okay if inplace_update_support
@ -498,6 +514,17 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
size_t total_byte_size = 0;
if (w.status.ok()) {
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : wal_write_group) {
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
SequenceNumber next_sequence = current_sequence;
for (auto writer : wal_write_group) {
if (writer->CheckCallback(this)) {
@ -722,6 +749,17 @@ Status DBImpl::WriteImplWALOnly(
write_thread->EnterAsBatchGroupLeader(&w, &write_group);
// Note: no need to update last_batch_group_size_ here since the batch writes
// to WAL only
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : write_group) {
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
size_t pre_release_callback_cnt = 0;
size_t total_byte_size = 0;

@ -301,6 +301,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) {
trace_opts.filter |= kTraceFilterMultiGet;
trace_opts.filter |= kTraceFilterIteratorSeek;
trace_opts.filter |= kTraceFilterIteratorSeekForPrev;
trace_opts.preserve_write_order = true;
s = db->StartTrace(trace_opts, std::move(trace_writer));
}

@ -1870,6 +1870,13 @@ struct TraceOptions {
uint64_t sampling_frequency = 1;
// Note: The filtering happens before sampling.
uint64_t filter = kTraceFilterNone;
// When true, the order of write records in the trace will match the order of
// the corresponding write records in the WAL and applied to the DB. There may
// be a performance penalty associated with preserving this ordering.
//
// Default: false. This means write records in the trace may be in an order
// different from the WAL's order.
bool preserve_write_order = false;
};
// ImportColumnFamilyOptions is used by ImportColumnFamily()

@ -150,6 +150,10 @@ class Tracer {
// False otherwise.
bool IsTraceFileOverMax();
// Returns true if the order of write trace records must match the order of
// the corresponding records logged to WAL and applied to the DB.
bool IsWriteOrderPreserved() { return trace_options_.preserve_write_order; }
// Writes a trace footer at the end of the tracing
Status Close();

Loading…
Cancel
Save