Trace and Replay for RocksDB (#3837)
Summary: A framework for tracing and replaying RocksDB operations. A binary trace file is created by capturing the DB operations, and it can be replayed back at the same rate using db_bench. - Column-families are supported - Multi-threaded tracing is supported. - TraceReader and TraceWriter are exposed to the user, so that tracing to various destinations can be enabled (say, to other messaging/logging services). By default, a FileTraceReader and FileTraceWriter are implemented to capture to a file and replay from it. - This is not yet ideal to be enabled in production due to large performance overhead, but it can be safely tried out in a shadow setup, say, for analyzing RocksDB operations. Currently supported DB operations: - Writes: -- Put -- Merge -- Delete -- SingleDelete -- DeleteRange -- Write - Reads: -- Get (point lookups) Pull Request resolved: https://github.com/facebook/rocksdb/pull/3837 Differential Revision: D7974837 Pulled By: sagar0 fbshipit-source-id: 8ec65aaf336504bc1f6ed0feae67f6ed5ef97a72main
parent
ee7617167f
commit
12b6cdeed3
@ -0,0 +1,47 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#pragma once |
||||
|
||||
#include "rocksdb/env.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
// Allow custom implementations of TraceWriter and TraceReader.
|
||||
// By default, RocksDB provides a way to capture the traces to a file using the
|
||||
// factory NewFileTraceWriter(). But users could also choose to export traces to
|
||||
// any other system by providing custom implementations of TraceWriter and
|
||||
// TraceReader.
|
||||
|
||||
// TraceWriter allows exporting RocksDB traces to any system, one operation at
|
||||
// a time.
|
||||
class TraceWriter { |
||||
public: |
||||
TraceWriter() {} |
||||
virtual ~TraceWriter() {} |
||||
|
||||
virtual Status Write(const Slice& data) = 0; |
||||
virtual Status Close() = 0; |
||||
}; |
||||
|
||||
// TraceReader allows reading RocksDB traces from any system, one operation at
|
||||
// a time. A RocksDB Replayer could depend on this to replay opertions.
|
||||
class TraceReader { |
||||
public: |
||||
TraceReader() {} |
||||
virtual ~TraceReader() {} |
||||
|
||||
virtual Status Read(std::string* data) = 0; |
||||
virtual Status Close() = 0; |
||||
}; |
||||
|
||||
// Factory methods to read/write traces from/to a file.
|
||||
Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, |
||||
const std::string& trace_filename, |
||||
std::unique_ptr<TraceWriter>* trace_writer); |
||||
Status NewFileTraceReader(Env* env, const EnvOptions& env_options, |
||||
const std::string& trace_filename, |
||||
std::unique_ptr<TraceReader>* trace_reader); |
||||
} // namespace rocksdb
|
@ -0,0 +1,205 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#include "util/trace_replay.h" |
||||
|
||||
#include <chrono> |
||||
#include <sstream> |
||||
#include <thread> |
||||
#include "db/db_impl.h" |
||||
#include "rocksdb/slice.h" |
||||
#include "rocksdb/write_batch.h" |
||||
#include "util/coding.h" |
||||
#include "util/string_util.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
namespace { |
||||
// Appends a Get payload to *dst: the column family id as a fixed32, followed
// by the key as a length-prefixed slice.
void EncodeCFAndKey(std::string* dst, uint32_t cf_id, const Slice& key) {
  std::string& out = *dst;
  PutFixed32(&out, cf_id);
  PutLengthPrefixedSlice(&out, key);
}
||||
|
||||
// Parses a payload produced by EncodeCFAndKey(): a fixed32 column family id
// followed by a length-prefixed key.
//
// buffer is only read. *key is a view over buffer's bytes, so buffer must
// stay alive and unmodified while *key is in use.
//
// If the column family id cannot be decoded, the key is not decoded either:
// the input has not been advanced, so the id bytes would otherwise be
// misread as the key's length prefix. Outputs that could not be decoded are
// left untouched.
void DecodeCFAndKey(const std::string& buffer, uint32_t* cf_id, Slice* key) {
  Slice buf(buffer);
  if (!GetFixed32(&buf, cf_id)) {
    return;
  }
  GetLengthPrefixedSlice(&buf, key);
}
||||
} // namespace
|
||||
|
||||
// Takes ownership of trace_writer and writes the trace header right away.
// The status returned by WriteHeader() is not surfaced to the caller.
Tracer::Tracer(Env* env, std::unique_ptr<TraceWriter>&& trace_writer)
    : env_(env),
      trace_writer_(std::move(trace_writer)) {
  WriteHeader();
}
||||
|
||||
// Drops the owned TraceWriter.
Tracer::~Tracer() {
  trace_writer_ = nullptr;
}
||||
|
||||
// Records a write: the payload is the batch's serialized contents, stamped
// with the current time from env_.
Status Tracer::Write(WriteBatch* write_batch) {
  Trace record;
  record.ts = env_->NowMicros();
  record.type = kTraceWrite;
  record.payload = write_batch->Data();
  const Status status = WriteTrace(record);
  return status;
}
||||
|
||||
// Records a point lookup: the payload is the column family id plus the key,
// stamped with the current time from env_.
Status Tracer::Get(ColumnFamilyHandle* column_family, const Slice& key) {
  Trace record;
  record.ts = env_->NowMicros();
  record.type = kTraceGet;
  const uint32_t cf_id = column_family->GetID();
  EncodeCFAndKey(&record.payload, cf_id, key);
  return WriteTrace(record);
}
||||
|
||||
// Writes the kTraceBegin record. Its payload is a tab-separated text line
// that starts with kTraceMagic and carries the trace and RocksDB versions.
Status Tracer::WriteHeader() {
  std::ostringstream text;
  text << kTraceMagic << "\t";
  text << "Trace Version: 0.1\t";
  text << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t";
  text << "Format: Timestamp OpType Payload\n";

  Trace begin;
  begin.ts = env_->NowMicros();
  begin.type = kTraceBegin;
  begin.payload = text.str();
  return WriteTrace(begin);
}
||||
|
||||
// Writes the kTraceEnd record, which carries an empty payload.
Status Tracer::WriteFooter() {
  Trace end;
  end.ts = env_->NowMicros();
  end.type = kTraceEnd;
  end.payload.clear();
  return WriteTrace(end);
}
||||
|
||||
// Serializes one record and hands it to the TraceWriter. Layout:
//   fixed64 timestamp | 1-byte type | fixed32 payload length | payload bytes
Status Tracer::WriteTrace(const Trace& trace) {
  const uint32_t payload_size = static_cast<uint32_t>(trace.payload.size());

  std::string record;
  PutFixed64(&record, trace.ts);
  record.push_back(trace.type);
  PutFixed32(&record, payload_size);
  record.append(trace.payload);

  Slice record_slice(record);
  return trace_writer_->Write(record_slice);
}
||||
|
||||
// Finishes the trace by writing the footer record.
Status Tracer::Close() {
  Status footer_status = WriteFooter();
  return footer_status;
}
||||
|
||||
// Takes ownership of reader, keeps the root DB of db as a DBImpl, and indexes
// the supplied column family handles by their ids.
Replayer::Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles,
                   std::unique_ptr<TraceReader>&& reader)
    : trace_reader_(std::move(reader)) {
  assert(db != nullptr);
  db_ = static_cast<DBImpl*>(db->GetRootDB());
  for (auto handle_it = handles.begin(); handle_it != handles.end();
       ++handle_it) {
    ColumnFamilyHandle* handle = *handle_it;
    cf_map_[handle->GetID()] = handle;
  }
}
||||
|
||||
// Drops the owned TraceReader.
Replayer::~Replayer() {
  trace_reader_ = nullptr;
}
||||
|
||||
// Reads the trace from trace_reader_ and re-issues every recorded operation
// against db_, preserving the recorded spacing between operations.
//
// Each record is delayed until (replay start + record.ts - header.ts), so
// operations are issued at the rate at which they were captured.
//
// Returns:
//  - the error from ReadHeader() if the header is missing or invalid;
//  - Corruption if a Get record names a non-default column family that was
//    not passed to the constructor;
//  - OK when a kTraceEnd record is reached, or when the reader reports
//    Incomplete (end of data without a footer);
//  - any other error from the reader.
//
// Statuses of the replayed Write()/Get() calls are intentionally not checked,
// and records of unrecognized type are skipped.
Status Replayer::Replay() {
  Trace header;
  Status s = ReadHeader(&header);
  if (!s.ok()) {
    return s;
  }

  const std::chrono::system_clock::time_point replay_epoch =
      std::chrono::system_clock::now();
  WriteOptions woptions;
  ReadOptions roptions;
  Trace trace;
  while (s.ok()) {
    trace.reset();
    s = ReadTrace(&trace);
    if (!s.ok()) {
      break;
    }

    // Wait until this record's offset from the start of the trace has
    // elapsed since the start of the replay.
    std::this_thread::sleep_until(
        replay_epoch + std::chrono::microseconds(trace.ts - header.ts));
    if (trace.type == kTraceWrite) {
      WriteBatch batch(trace.payload);
      db_->Write(woptions, &batch);
    } else if (trace.type == kTraceGet) {
      uint32_t cf_id = 0;
      Slice key;
      DecodeCFAndKey(trace.payload, &cf_id, &key);

      // Id 0 goes through the overload without a column family handle; any
      // other id must have been registered via the constructor.
      ColumnFamilyHandle* cfh = nullptr;
      if (cf_id > 0) {
        auto cf_it = cf_map_.find(cf_id);
        if (cf_it == cf_map_.end()) {
          return Status::Corruption("Invalid Column Family ID.");
        }
        cfh = cf_it->second;
      }

      std::string value;
      if (cf_id == 0) {
        db_->Get(roptions, key, &value);
      } else {
        db_->Get(roptions, cfh, key, &value);
      }
    } else if (trace.type == kTraceEnd) {
      // Do nothing for now.
      // TODO: Add some validations later.
      break;
    }
  }

  if (s.IsIncomplete()) {
    // Reaching eof returns Incomplete status at the moment.
    // Could happen when killing a process without calling EndTrace() API.
    // TODO: Add better error handling.
    return Status::OK();
  }
  return s;
}
||||
|
||||
// Reads the next record into *header and verifies that it is a kTraceBegin
// record whose payload starts with kTraceMagic.
Status Replayer::ReadHeader(Trace* header) {
  assert(header != nullptr);
  Status s = ReadTrace(header);
  if (s.ok()) {
    if (header->type != kTraceBegin) {
      return Status::Corruption("Corrupted trace file. Incorrect header.");
    }
    const bool magic_matches =
        header->payload.compare(0, kTraceMagic.length(), kTraceMagic) == 0;
    if (!magic_matches) {
      return Status::Corruption("Corrupted trace file. Incorrect magic.");
    }
  }
  return s;
}
||||
|
||||
// Reads the next record into *footer and verifies that it is a kTraceEnd
// record.
Status Replayer::ReadFooter(Trace* footer) {
  assert(footer != nullptr);
  Status s = ReadTrace(footer);
  if (s.ok() && footer->type != kTraceEnd) {
    return Status::Corruption("Corrupted trace file. Incorrect footer.");
  }

  // TODO: Add more validations later
  return s;
}
||||
|
||||
// Reads one serialized record from trace_reader_ and decodes it into *trace.
//
// Expected layout (as produced by Tracer::WriteTrace):
//   fixed64 timestamp | 1-byte type | fixed32 payload length | payload bytes
//
// Returns the reader's status on failure, or Corruption when the record is
// shorter than the fixed-size metadata. The payload length field itself is
// skipped rather than validated: everything after the metadata is taken as
// the payload.
Status Replayer::ReadTrace(Trace* trace) {
  assert(trace != nullptr);
  std::string encoded_trace;
  Status s = trace_reader_->Read(&encoded_trace);
  if (!s.ok()) {
    return s;
  }

  // A TraceReader may be user-supplied, so do not trust the record to hold
  // the full metadata; indexing or trimming a shorter slice would read out of
  // bounds.
  if (encoded_trace.size() < kTraceMetadataSize) {
    return Status::Corruption("Corrupted trace file. Incomplete trace record.");
  }

  Slice enc_slice = Slice(encoded_trace);
  GetFixed64(&enc_slice, &trace->ts);
  trace->type = static_cast<TraceType>(enc_slice[0]);
  enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize);
  trace->payload = enc_slice.ToString();
  return s;
}
||||
|
||||
} // namespace rocksdb
|
@ -0,0 +1,91 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#pragma once |
||||
|
||||
#include <memory> |
||||
#include <unordered_map> |
||||
#include <utility> |
||||
|
||||
#include "rocksdb/env.h" |
||||
#include "rocksdb/trace_reader_writer.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class ColumnFamilyHandle; |
||||
class DB; |
||||
class DBImpl; |
||||
class Slice; |
||||
class WriteBatch; |
||||
|
||||
// Marker expected at the start of the header record's payload.
const std::string kTraceMagic = "feedcafedeadbeef";

// Sizes, in bytes, of the fixed fields that precede each record's payload.
constexpr unsigned int kTraceTimestampSize = 8;
constexpr unsigned int kTraceTypeSize = 1;
constexpr unsigned int kTracePayloadLengthSize = 4;

// Combined size of the fixed fields above.
constexpr unsigned int kTraceMetadataSize =
    kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize;
||||
|
||||
// Kind of a trace record, stored as a single byte.
enum TraceType : char {
  kTraceBegin = 1,  // header record
  kTraceEnd = 2,    // footer record
  kTraceWrite = 3,  // WriteBatch payload
  kTraceGet = 4,    // column family id + key payload
  kTraceMax = 5,    // value assigned by Trace::reset()
};
||||
|
||||
// TODO: This should also be made part of public interface to help users build
|
||||
// custom TracerReaders and TraceWriters.
|
||||
struct Trace { |
||||
uint64_t ts; |
||||
TraceType type; |
||||
std::string payload; |
||||
|
||||
void reset() { |
||||
ts = 0; |
||||
type = kTraceMax; |
||||
payload.clear(); |
||||
} |
||||
}; |
||||
|
||||
// Trace RocksDB operations using a TraceWriter.
|
||||
class Tracer { |
||||
public: |
||||
Tracer(Env* env, std::unique_ptr<TraceWriter>&& trace_writer); |
||||
~Tracer(); |
||||
|
||||
Status Write(WriteBatch* write_batch); |
||||
Status Get(ColumnFamilyHandle* cfname, const Slice& key); |
||||
|
||||
Status Close(); |
||||
|
||||
private: |
||||
Status WriteHeader(); |
||||
Status WriteFooter(); |
||||
Status WriteTrace(const Trace& trace); |
||||
|
||||
Env* env_; |
||||
unique_ptr<TraceWriter> trace_writer_; |
||||
}; |
||||
|
||||
// Replay RocksDB operations from a trace.
|
||||
class Replayer { |
||||
public: |
||||
Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles, |
||||
std::unique_ptr<TraceReader>&& reader); |
||||
~Replayer(); |
||||
|
||||
Status Replay(); |
||||
|
||||
private: |
||||
Status ReadHeader(Trace* header); |
||||
Status ReadFooter(Trace* footer); |
||||
Status ReadTrace(Trace* trace); |
||||
|
||||
DBImpl* db_; |
||||
std::unique_ptr<TraceReader> trace_reader_; |
||||
std::unordered_map<uint32_t, ColumnFamilyHandle*> cf_map_; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
@ -0,0 +1,117 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#include "utilities/trace/file_trace_reader_writer.h" |
||||
|
||||
#include "util/coding.h" |
||||
#include "util/file_reader_writer.h" |
||||
#include "util/trace_replay.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
// Size of the scratch buffer used for file reads: 1KB.
const unsigned int FileTraceReader::kBufferSize = 1024u;
|
||||
|
||||
// Takes ownership of reader, starts at file offset 0 and allocates the
// kBufferSize-byte scratch buffer that the destructor frees.
FileTraceReader::FileTraceReader(
    std::unique_ptr<RandomAccessFileReader>&& reader)
    : file_reader_(std::move(reader)),
      offset_(0),
      buffer_(new char[kBufferSize]) {
}
||||
|
||||
// Closes the reader, then frees the scratch buffer allocated by the
// constructor.
FileTraceReader::~FileTraceReader() {
  Close();
  char* const scratch = buffer_;
  delete[] scratch;
}
||||
|
||||
// Releases the underlying file reader. Always returns OK.
Status FileTraceReader::Close() {
  file_reader_ = nullptr;
  return Status::OK();
}
||||
|
||||
// Reads the next complete trace record (metadata followed by payload) from
// the file into *data and advances the read offset past it.
//
// Returns:
//  - Incomplete when there is no more data at the current offset;
//  - Corruption when the metadata or the payload is truncated;
//  - the file reader's status if a read fails;
//  - OK otherwise.
Status FileTraceReader::Read(std::string* data) {
  assert(file_reader_ != nullptr);
  Status s = file_reader_->Read(offset_, kTraceMetadataSize, &result_, buffer_);
  if (!s.ok()) {
    return s;
  }
  if (result_.size() == 0) {
    // No more data to read
    // TODO: Come up with a better way to indicate end of data. Maybe this
    // could be avoided once footer is introduced.
    return Status::Incomplete();
  }
  if (result_.size() < kTraceMetadataSize) {
    return Status::Corruption("Corrupted trace file.");
  }
  *data = result_.ToString();
  offset_ += kTraceMetadataSize;

  // Decode the payload length from result_, not from buffer_: result_ is what
  // the reader reports as the bytes read, while buffer_ is only scratch space
  // that the reader is not required to fill.
  uint32_t payload_len =
      DecodeFixed32(result_.data() + kTraceTimestampSize + kTraceTypeSize);

  // Read Payload, at most kBufferSize bytes per read.
  unsigned int bytes_to_read = payload_len;
  unsigned int to_read =
      bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read;
  while (to_read > 0) {
    s = file_reader_->Read(offset_, to_read, &result_, buffer_);
    if (!s.ok()) {
      return s;
    }
    if (result_.size() < to_read) {
      return Status::Corruption("Corrupted trace file.");
    }
    data->append(result_.data(), result_.size());

    offset_ += to_read;
    bytes_to_read -= to_read;
    to_read = bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read;
  }

  return s;
}
||||
|
||||
// Closes the writer on destruction.
FileTraceWriter::~FileTraceWriter() {
  Close();
}
||||
|
||||
// Releases the underlying file writer. Always returns OK.
Status FileTraceWriter::Close() {
  file_writer_ = nullptr;
  return Status::OK();
}
||||
|
||||
// Appends data to the trace file and returns the writer's status.
Status FileTraceWriter::Write(const Slice& data) {
  Status append_status = file_writer_->Append(data);
  return append_status;
}
||||
|
||||
// Opens trace_filename for random-access reads through env and, on success,
// stores a FileTraceReader over it in *trace_reader. On failure the status
// from env is returned and *trace_reader is left untouched.
Status NewFileTraceReader(Env* env, const EnvOptions& env_options,
                          const std::string& trace_filename,
                          std::unique_ptr<TraceReader>* trace_reader) {
  std::unique_ptr<RandomAccessFile> trace_file;
  Status s = env->NewRandomAccessFile(trace_filename, &trace_file, env_options);
  if (s.ok()) {
    std::unique_ptr<RandomAccessFileReader> file_reader(
        new RandomAccessFileReader(std::move(trace_file), trace_filename));
    trace_reader->reset(new FileTraceReader(std::move(file_reader)));
  }
  return s;
}
||||
|
||||
// Creates trace_filename as a writable file through env and, on success,
// stores a FileTraceWriter over it in *trace_writer. On failure the status
// from env is returned and *trace_writer is left untouched.
Status NewFileTraceWriter(Env* env, const EnvOptions& env_options,
                          const std::string& trace_filename,
                          std::unique_ptr<TraceWriter>* trace_writer) {
  std::unique_ptr<WritableFile> trace_file;
  Status s = env->NewWritableFile(trace_filename, &trace_file, env_options);
  if (s.ok()) {
    std::unique_ptr<WritableFileWriter> file_writer(
        new WritableFileWriter(std::move(trace_file), env_options));
    trace_writer->reset(new FileTraceWriter(std::move(file_writer)));
  }
  return s;
}
||||
|
||||
} // namespace rocksdb
|
@ -0,0 +1,47 @@ |
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#pragma once |
||||
|
||||
#include "rocksdb/trace_reader_writer.h" |
||||
|
||||
namespace rocksdb { |
||||
|
||||
class RandomAccessFileReader; |
||||
class WritableFileWriter; |
||||
|
||||
// FileTraceReader allows reading RocksDB traces from a file.
|
||||
class FileTraceReader : public TraceReader { |
||||
public: |
||||
explicit FileTraceReader(std::unique_ptr<RandomAccessFileReader>&& reader); |
||||
~FileTraceReader(); |
||||
|
||||
virtual Status Read(std::string* data) override; |
||||
virtual Status Close() override; |
||||
|
||||
private: |
||||
unique_ptr<RandomAccessFileReader> file_reader_; |
||||
Slice result_; |
||||
size_t offset_; |
||||
char* const buffer_; |
||||
|
||||
static const unsigned int kBufferSize; |
||||
}; |
||||
|
||||
// FileTraceWriter allows writing RocksDB traces to a file.
|
||||
class FileTraceWriter : public TraceWriter { |
||||
public: |
||||
explicit FileTraceWriter(std::unique_ptr<WritableFileWriter>&& file_writer) |
||||
: file_writer_(std::move(file_writer)) {} |
||||
~FileTraceWriter(); |
||||
|
||||
virtual Status Write(const Slice& data) override; |
||||
virtual Status Close() override; |
||||
|
||||
private: |
||||
unique_ptr<WritableFileWriter> file_writer_; |
||||
}; |
||||
|
||||
} // namespace rocksdb
|
Loading…
Reference in new issue