Improve fault injection to MultiRead (#8937)

Summary:
Several improvements to MultiRead:
1. Fix a bug in the stress test that causes a false positive when both the MultiRead() return status and an individual read request have a failure injected.
2. Add two more types of fault that should be handled: empty read results and checksum mismatches.
3. Add a message indicating which type of fault was injected.
4. Increase the failure rate.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8937

Reviewed By: anand1976

Differential Revision: D31085930

fbshipit-source-id: 3a04994a3cadebf9a64d25e1fe12b14b7a272fba
main
sdong 3 years ago committed by Facebook GitHub Bot
parent fcce1f2c7a
commit 9320067703
  1. 4
      db_stress_tool/no_batched_ops_stress.cc
  2. 2
      tools/db_crashtest.py
  3. 64
      utilities/fault_injection_fs.cc
  4. 9
      utilities/fault_injection_fs.h

@ -349,7 +349,9 @@ class NonBatchedOpsStressTest : public StressTest {
// Grab mutex so multiple thread don't try to print the // Grab mutex so multiple thread don't try to print the
// stack trace at the same time // stack trace at the same time
MutexLock l(thread->shared->GetMutex()); MutexLock l(thread->shared->GetMutex());
fprintf(stderr, "Didn't get expected error from MultiGet\n"); fprintf(stderr, "Didn't get expected error from MultiGet. \n");
fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n", num_keys,
error_count, stat_nok);
fprintf(stderr, "Callstack that injected the fault\n"); fprintf(stderr, "Callstack that injected the fault\n");
fault_fs_guard->PrintFaultBacktrace(); fault_fs_guard->PrintFaultBacktrace();
std::terminate(); std::terminate();

@ -140,7 +140,7 @@ default_params = {
"continuous_verification_interval" : 0, "continuous_verification_interval" : 0,
"max_key_len": 3, "max_key_len": 3,
"key_len_percent_dist": "1,30,69", "key_len_percent_dist": "1,30,69",
"read_fault_one_in": lambda: random.choice([0, 1000]), "read_fault_one_in": lambda: random.choice([0, 32, 1000]),
"open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]), "open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]),
"open_write_fault_one_in": lambda: random.choice([0, 0, 16]), "open_write_fault_one_in": lambda: random.choice([0, 0, 16]),
"open_read_fault_one_in": lambda: random.choice([0, 0, 32]), "open_read_fault_one_in": lambda: random.choice([0, 0, 32]),

@ -26,6 +26,7 @@
#include "util/coding.h" #include "util/coding.h"
#include "util/crc32c.h" #include "util/crc32c.h"
#include "util/random.h" #include "util/random.h"
#include "util/string_util.h"
#include "util/xxhash.h" #include "util/xxhash.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -340,7 +341,7 @@ IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n,
if (s.ok()) { if (s.ok()) {
s = fs_->InjectThreadSpecificReadError( s = fs_->InjectThreadSpecificReadError(
FaultInjectionTestFS::ErrorOperation::kRead, result, use_direct_io(), FaultInjectionTestFS::ErrorOperation::kRead, result, use_direct_io(),
scratch); scratch, /*need_count_increase=*/true, /*fault_injected=*/nullptr);
} }
if (s.ok() && fs_->ShouldInjectRandomReadError()) { if (s.ok() && fs_->ShouldInjectRandomReadError()) {
return IOStatus::IOError("Injected read error"); return IOStatus::IOError("Injected read error");
@ -355,19 +356,25 @@ IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
return fs_->GetError(); return fs_->GetError();
} }
IOStatus s = target_->MultiRead(reqs, num_reqs, options, dbg); IOStatus s = target_->MultiRead(reqs, num_reqs, options, dbg);
bool injected_error = false;
for (size_t i = 0; i < num_reqs; i++) { for (size_t i = 0; i < num_reqs; i++) {
if (!reqs[i].status.ok()) { if (!reqs[i].status.ok()) {
// Already seeing an error. // Already seeing an error.
break; break;
} }
bool this_injected_error;
reqs[i].status = fs_->InjectThreadSpecificReadError( reqs[i].status = fs_->InjectThreadSpecificReadError(
FaultInjectionTestFS::ErrorOperation::kRead, &reqs[i].result, FaultInjectionTestFS::ErrorOperation::kMultiReadSingleReq,
use_direct_io(), reqs[i].scratch); &(reqs[i].result), use_direct_io(), reqs[i].scratch,
/*need_count_increase=*/true,
/*fault_injected=*/&this_injected_error);
injected_error |= this_injected_error;
} }
if (s.ok()) { if (s.ok()) {
s = fs_->InjectThreadSpecificReadError( s = fs_->InjectThreadSpecificReadError(
FaultInjectionTestFS::ErrorOperation::kRead, nullptr, use_direct_io(), FaultInjectionTestFS::ErrorOperation::kMultiRead, nullptr,
nullptr); use_direct_io(), nullptr, /*need_count_increase=*/!injected_error,
/*fault_injected=*/nullptr);
} }
if (s.ok() && fs_->ShouldInjectRandomReadError()) { if (s.ok() && fs_->ShouldInjectRandomReadError()) {
return IOStatus::IOError("Injected read error"); return IOStatus::IOError("Injected read error");
@ -550,7 +557,9 @@ IOStatus FaultInjectionTestFS::NewRandomAccessFile(
return IOStatus::IOError("Injected error when open random access file"); return IOStatus::IOError("Injected error when open random access file");
} }
IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr, IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr,
false, nullptr); false, nullptr,
/*need_count_increase=*/true,
/*fault_injected=*/nullptr);
if (io_s.ok()) { if (io_s.ok()) {
io_s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); io_s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
} }
@ -759,8 +768,11 @@ void FaultInjectionTestFS::UntrackFile(const std::string& f) {
} }
IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError(
ErrorOperation /*op*/, Slice* /*result*/, bool /*direct_io*/, ErrorOperation op, Slice* result, bool direct_io, char* /*scratch*/,
char* /*scratch*/) { bool need_count_increase, bool* fault_injected) {
bool dummy_bool;
bool& ret_fault_injected = fault_injected ? *fault_injected : dummy_bool;
ret_fault_injected = false;
ErrorContext* ctx = ErrorContext* ctx =
static_cast<ErrorContext*>(thread_local_error_->Get()); static_cast<ErrorContext*>(thread_local_error_->Get());
if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) { if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) {
@ -768,12 +780,47 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError(
} }
if (ctx->rand.OneIn(ctx->one_in)) { if (ctx->rand.OneIn(ctx->one_in)) {
if (ctx->count == 0) {
ctx->message = "";
}
if (need_count_increase) {
ctx->count++; ctx->count++;
}
if (ctx->callstack) { if (ctx->callstack) {
free(ctx->callstack); free(ctx->callstack);
} }
ctx->callstack = port::SaveStack(&ctx->frames); ctx->callstack = port::SaveStack(&ctx->frames);
if (op != ErrorOperation::kMultiReadSingleReq) {
// Not a per-request status inside MultiRead (i.e. a plain read, an
// open, or the overall MultiRead status): inject a plain IOError.
ctx->message += "error; ";
ret_fault_injected = true;
return IOStatus::IOError(); return IOStatus::IOError();
} else if (Random::GetTLSInstance()->OneIn(8)) {
assert(result);
// With a small chance, leave the returned status OK but make the
// result empty, which is supposed to be caught by a check.
*result = Slice();
ctx->message += "inject empty result; ";
ret_fault_injected = true;
} else if (!direct_io && Random::GetTLSInstance()->OneIn(7)) {
assert(result);
// With direct I/O, many extra bytes might be read, so corrupting
// one byte might not cause a checksum mismatch. Skip checksum
// corruption injection in that case.
// With a small chance, leave the returned status OK but corrupt the
// result in a way that checksum verification is supposed to fail.
// Corrupt the last byte, which is supposed to be a checksum byte.
// It would work for CRC. Not 100% sure for xxhash; will adjust
// if it is not the case.
const_cast<char*>(result->data())[result->size() - 1]++;
ctx->message += "corrupt last byte; ";
ret_fault_injected = true;
} else {
ctx->message += "error result multiget single; ";
ret_fault_injected = true;
return IOStatus::IOError();
}
} }
return IOStatus::OK(); return IOStatus::OK();
} }
@ -835,6 +882,7 @@ void FaultInjectionTestFS::PrintFaultBacktrace() {
return; return;
} }
fprintf(stderr, "Injected error type = %d\n", ctx->type); fprintf(stderr, "Injected error type = %d\n", ctx->type);
fprintf(stderr, "Message: %s\n", ctx->message.c_str());
port::PrintAndFreeStack(ctx->callstack, ctx->frames); port::PrintAndFreeStack(ctx->callstack, ctx->frames);
ctx->callstack = nullptr; ctx->callstack = nullptr;
#endif #endif

@ -370,6 +370,8 @@ class FaultInjectionTestFS : public FileSystemWrapper {
// Specify what the operation, so we can inject the right type of error // Specify what the operation, so we can inject the right type of error
enum ErrorOperation : char { enum ErrorOperation : char {
kRead = 0, kRead = 0,
kMultiReadSingleReq = 1,
kMultiRead = 2,
kOpen, kOpen,
}; };
@ -440,8 +442,12 @@ class FaultInjectionTestFS : public FileSystemWrapper {
// corruption in the contents of scratch, or truncation of slice // corruption in the contents of scratch, or truncation of slice
// are the types of error with equal probability. For OPEN, // are the types of error with equal probability. For OPEN,
// its always an IOError. // its always an IOError.
// fault_injected returns whether a fault was injected. It is needed
// because some faults are injected while the returned IOStatus is OK.
IOStatus InjectThreadSpecificReadError(ErrorOperation op, Slice* slice, IOStatus InjectThreadSpecificReadError(ErrorOperation op, Slice* slice,
bool direct_io, char* scratch); bool direct_io, char* scratch,
bool need_count_increase,
bool* fault_injected);
// Get the count of how many times we injected since the previous call // Get the count of how many times we injected since the previous call
int GetAndResetErrorCount() { int GetAndResetErrorCount() {
@ -525,6 +531,7 @@ class FaultInjectionTestFS : public FileSystemWrapper {
int count; int count;
bool enable_error_injection; bool enable_error_injection;
void* callstack; void* callstack;
std::string message;
int frames; int frames;
ErrorType type; ErrorType type;

Loading…
Cancel
Save