diff --git a/HISTORY.md b/HISTORY.md
index 8b0bd1b70..6e0c54e53 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,7 @@
 ## Unreleased
 ### Public API change
 * Added values to `TraceFilterType`: `kTraceFilterIteratorSeek`, `kTraceFilterIteratorSeekForPrev`, and `kTraceFilterMultiGet`. They can be set in `TraceOptions` to filter out the operation types after which they are named.
+* Added `TraceOptions::preserve_write_order`. When enabled it guarantees write records are traced in the same order they are logged to WAL and applied to the DB. By default it is disabled (false) to match the legacy behavior and prevent regression.
 
 ## 6.28.0 (2021-12-17)
 ### New Features
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index b3d57643d..72a47d83a 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -75,9 +75,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   if (my_batch == nullptr) {
     return Status::Corruption("Batch is nullptr!");
   }
+  // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+  // grabs but does not seem thread-safe.
   if (tracer_) {
     InstrumentedMutexLock lock(&trace_mutex_);
-    if (tracer_) {
+    if (tracer_ && !tracer_->IsWriteOrderPreserved()) {
+      // We don't have to preserve write order so can trace anywhere. It's more
+      // efficient to trace here than to add latency to a phase of the log/apply
+      // pipeline.
       // TODO: maybe handle the tracing status?
       tracer_->Write(my_batch).PermitUncheckedError();
     }
@@ -249,6 +254,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   IOStatus io_s;
   Status pre_release_cb_status;
   if (status.ok()) {
+    // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+    // grabs but does not seem thread-safe.
+    if (tracer_) {
+      InstrumentedMutexLock lock(&trace_mutex_);
+      if (tracer_ && tracer_->IsWriteOrderPreserved()) {
+        for (auto* writer : write_group) {
+          // TODO: maybe handle the tracing status?
+          tracer_->Write(writer->batch).PermitUncheckedError();
+        }
+      }
+    }
     // Rules for when we can update the memtable concurrently
     // 1. supported by memtable
     // 2. Puts are not okay if inplace_update_support
@@ -498,6 +514,17 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     size_t total_byte_size = 0;
 
     if (w.status.ok()) {
+      // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+      // grabs but does not seem thread-safe.
+      if (tracer_) {
+        InstrumentedMutexLock lock(&trace_mutex_);
+        if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+          for (auto* writer : wal_write_group) {
+            // TODO: maybe handle the tracing status?
+            tracer_->Write(writer->batch).PermitUncheckedError();
+          }
+        }
+      }
       SequenceNumber next_sequence = current_sequence;
       for (auto writer : wal_write_group) {
         if (writer->CheckCallback(this)) {
@@ -722,6 +749,17 @@ Status DBImpl::WriteImplWALOnly(
   write_thread->EnterAsBatchGroupLeader(&w, &write_group);
   // Note: no need to update last_batch_group_size_ here since the batch writes
   // to WAL only
+  // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+  // grabs but does not seem thread-safe.
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+      for (auto* writer : write_group) {
+        // TODO: maybe handle the tracing status?
+        tracer_->Write(writer->batch).PermitUncheckedError();
+      }
+    }
+  }
 
   size_t pre_release_callback_cnt = 0;
   size_t total_byte_size = 0;
diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc
index 6c2c51abc..c787ff757 100644
--- a/db_stress_tool/expected_state.cc
+++ b/db_stress_tool/expected_state.cc
@@ -301,6 +301,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) {
     trace_opts.filter |= kTraceFilterMultiGet;
     trace_opts.filter |= kTraceFilterIteratorSeek;
     trace_opts.filter |= kTraceFilterIteratorSeekForPrev;
+    trace_opts.preserve_write_order = true;
     s = db->StartTrace(trace_opts, std::move(trace_writer));
   }
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 2f18b132e..a8123328c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1870,6 +1870,13 @@ struct TraceOptions {
   uint64_t sampling_frequency = 1;
   // Note: The filtering happens before sampling.
   uint64_t filter = kTraceFilterNone;
+  // When true, the order of write records in the trace will match the order of
+  // the corresponding write records in the WAL and applied to the DB. There may
+  // be a performance penalty associated with preserving this ordering.
+  //
+  // Default: false. This means write records in the trace may be in an order
+  // different from the WAL's order.
+  bool preserve_write_order = false;
 };
 
 // ImportColumnFamilyOptions is used by ImportColumnFamily()
diff --git a/trace_replay/trace_replay.h b/trace_replay/trace_replay.h
index 4990accef..9aba5ceb7 100644
--- a/trace_replay/trace_replay.h
+++ b/trace_replay/trace_replay.h
@@ -150,6 +150,10 @@ class Tracer {
   // False otherwise.
   bool IsTraceFileOverMax();
 
+  // Returns true if the order of write trace records must match the order of
+  // the corresponding records logged to WAL and applied to the DB.
+  bool IsWriteOrderPreserved() { return trace_options_.preserve_write_order; }
+
   // Writes a trace footer at the end of the tracing
   Status Close();
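
For reference, a minimal sketch (not part of the patch) of how a caller might enable the new option when starting a trace, modeled on the FileExpectedStateManager::SaveAtAndAfter() change above. The helper name StartOrderedTrace and the trace file path are illustrative assumptions; TraceOptions, the trace filter flags, NewFileTraceWriter(), and DB::StartTrace() are existing RocksDB APIs.

#include <memory>
#include <string>
#include <utility>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/trace_reader_writer.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical helper: start a trace whose write records preserve WAL/apply
// order. `trace_path` is caller-chosen; call db->EndTrace() later to stop.
Status StartOrderedTrace(DB* db, const std::string& trace_path) {
  std::unique_ptr<TraceWriter> trace_writer;
  Status s = NewFileTraceWriter(db->GetEnv(), EnvOptions(), trace_path,
                                &trace_writer);
  if (!s.ok()) {
    return s;
  }
  TraceOptions trace_opts;
  // Keep only write records, as the db_stress change above does: filter out
  // point reads, multigets, and iterator seeks.
  trace_opts.filter |= kTraceFilterGet;
  trace_opts.filter |= kTraceFilterMultiGet;
  trace_opts.filter |= kTraceFilterIteratorSeek;
  trace_opts.filter |= kTraceFilterIteratorSeekForPrev;
  // New option from this patch: trace write records in the order they are
  // logged to the WAL and applied to the DB, at some potential write-latency
  // cost.
  trace_opts.preserve_write_order = true;
  return db->StartTrace(trace_opts, std::move(trace_writer));
}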