Summary:
This is the initial version. A few ways in which this could be extended in the future:
(a) Killing from more places in the source code.
(b) Hashing the stack and using that hash in determining whether to crash, to avoid crashing more often at source lines that are executed more often.
(c) Raising exceptions or returning errors instead of killing.

Test Plan:
This whole thing is for testing. Here is part of the output:

python2.7 tools/db_crashtest2.py -d 600
Running db_stress
db_stress retncode -15 output LevelDB version : 1.5
Number of threads    : 32
Ops per thread       : 10000000
Read percentage      : 50
Write-buffer-size    : 4194304
Delete percentage    : 30
Max key              : 1000
Ratio #ops/#keys     : 320000
Num times DB reopens : 0
Batches/snapshots    : 1
Purge redundant %    : 50
Num keys per lock    : 4
Compression          : snappy
------------------------------------------------
No lock creation because test_batches_snapshots set
2013/04/26-17:55:17 Starting database operations
Created bg thread 0x7fc1f07ff700
... finished 60000 ops
Running db_stress
db_stress retncode -15 output LevelDB version : 1.5
Number of threads    : 32
Ops per thread       : 10000000
Read percentage      : 50
Write-buffer-size    : 4194304
Delete percentage    : 30
Max key              : 1000
Ratio #ops/#keys     : 320000
Num times DB reopens : 0
Batches/snapshots    : 1
Purge redundant %    : 50
Num keys per lock    : 4
Compression          : snappy
------------------------------------------------
Created bg thread 0x7ff0137ff700
No lock creation because test_batches_snapshots set
2013/04/26-17:56:15 Starting database operations
... finished 90000 ops

Revert Plan: OK

Task ID: #2252691

Reviewers: dhruba, emayanke

Reviewed By: emayanke

CC: leveldb, haobo

Differential Revision: https://reviews.facebook.net/D10581
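Extension (b) is only described in prose above. Below is a minimal, purely illustrative Python sketch of that idea; the real kill hook lives in db_stress's C++ code, and the names maybe_kill and stack_counts are hypothetical, not part of this commit:

    import hashlib
    import random
    import traceback
    from collections import defaultdict

    # How many times each distinct call stack has reached the kill point.
    stack_counts = defaultdict(int)

    def maybe_kill(one_in):
        # Hash the current call stack so each code path gets its own counter.
        stack_hash = hashlib.md5(''.join(traceback.format_stack())).hexdigest()
        stack_counts[stack_hash] += 1
        # Scale the crash probability down for stacks seen more often, so hot
        # code paths do not swamp rarely executed ones.
        if random.randrange(one_in * stack_counts[stack_hash]) == 0:
            raise SystemExit('simulated crash at stack ' + stack_hash[:8])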
parent 87d0af15d8
commit 760dd4750f
@@ -0,0 +1,108 @@
#! /usr/bin/env python
import os
import sys
import time
import shlex
import getopt
import logging
import tempfile
import subprocess

# This python script runs db_stress multiple times with kill_random_test
# that causes leveldb to crash at various points in code.
# It also has test-batches-snapshot ON so that basic atomic/consistency
# checks can be performed.
#
def main(argv):
    os.system("make -C ~/rocksdb db_stress")
    try:
        opts, args = getopt.getopt(argv, "hd:t:k:o:b:")
    except getopt.GetoptError as err:
        print str(err)
        print "db_crashtest2.py -d <duration_test> -t <#threads> " \
              "-k <kills with prob 1/k> -o <ops_per_thread> " \
              "-b <write_buffer_size>\n"
        sys.exit(2)

    # default values, will be overridden by cmdline args
    kill_random_test = 97   # kill with probability 1/97 by default
    duration = 6000         # total time for this script to test db_stress
    threads = 32
    ops_per_thread = 200000
    write_buf_size = 4 * 1024 * 1024

    for opt, arg in opts:
        if opt == '-h':
            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
                  "-b <write_buffer_size>\n"
            sys.exit()
        elif opt == '-d':
            duration = int(arg)
        elif opt == '-t':
            threads = int(arg)
        elif opt == '-k':
            kill_random_test = int(arg)
        elif opt == '-i':
            interval = int(arg)
        elif opt == '-o':
            ops_per_thread = int(arg)
        elif opt == '-b':
            write_buf_size = int(arg)
        else:
            print "unrecognized option " + str(opt) + "\n"
            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
                  "-b <write_buffer_size>\n"
            sys.exit(2)

    exit_time = time.time() + duration

    dirpath = tempfile.mkdtemp()

    # kill in every alternate run. toggle tracks which run we are doing.
    toggle = True

    while time.time() < exit_time:
        run_had_errors = False
        print "Running db_stress \n"

        if toggle:
            # since we are going to kill anyway, use more ops per thread
            new_ops_per_thread = 100 * ops_per_thread
            killoption = '--kill_random_test=' + str(kill_random_test)
        else:
            new_ops_per_thread = ops_per_thread
            killoption = ''

        toggle = not toggle

        cmd = ['~/rocksdb/db_stress \
               --test_batches_snapshots=1 \
               --ops_per_thread=' + str(new_ops_per_thread) + ' \
               --threads=' + str(threads) + ' \
               --write_buffer_size=' + str(write_buf_size) + ' \
               --destroy_db_initially=0 ' + killoption + ' \
               --reopen=0 \
               --readpercent=50 \
               --db=' + dirpath + ' \
               --max_key=10000']
        try:
            subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
            if killoption != '':
                # a clean exit on a kill run means the kill point never fired
                logging.warn("WARNING: db_stress did not kill itself\n")
            continue
        except subprocess.CalledProcessError as e:
            # a non-zero (or signal) exit is expected on kill runs; scan the
            # output for genuine errors either way
            msg = "db_stress retncode {0} output {1}".format(e.returncode,
                                                             e.output)
            logging.info(msg)
            print msg
            msglower = msg.lower()
            if ('error' in msglower) or ('fail' in msglower):
                print "TEST FAILED!!!\n"
                sys.exit(2)
        time.sleep(1)  # time to stabilize after a kill


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))