Compare memtable insert and flush count (#8288)

Summary:
When a memtable is flushed, it will validate the number of entries it reads and compare that number with how many entries were inserted into the memtable. This serves as one sanity check against memory corruption. This change will also allow more counters to be added in the future for better validation.
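
As a rough illustration of the check, here is a minimal standalone sketch with hypothetical counts; the real comparison added in db/flush_job.cc appears in the diff below:

#include <cstdint>
#include <iostream>
#include <string>

int main() {
  // Hypothetical values; in RocksDB they come from the memtables being
  // flushed and from the entry counter maintained during the flush scan.
  uint64_t total_num_entries = 100;  // entries inserted into the memtables
  uint64_t num_input_entries = 99;   // entries actually read back in flush
  bool flush_verify_memtable_count = true;  // new DBOptions knob, default true

  if (num_input_entries != total_num_entries) {
    std::string msg = "Expected " + std::to_string(total_num_entries) +
                      " entries in memtables, but read " +
                      std::to_string(num_input_entries);
    if (flush_verify_memtable_count) {
      std::cout << "Corruption: " << msg << "\n";  // flush would fail
    } else {
      std::cout << "Warning: " << msg << "\n";  // mismatch only logged
    }
  }
  return 0;
}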

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8288

Test Plan: Pass all existing tests

Reviewed By: ajkr

Differential Revision: D28369194

fbshipit-source-id: 7ff870380c41eab7f99eee508550dcdce32838ad
Branch: main
Author: sdong (committed by Facebook GitHub Bot)
Parent: 94b4faa0f1
Commit: 2f1984dd45
14 files changed:
   1  HISTORY.md
   9  db/builder.cc
   3  db/builder.h
  52  db/compaction/compaction_iterator.cc
  50  db/compaction/compaction_iterator.h
   2  db/db_test2.cc
  16  db/flush_job.cc
  10  db/range_tombstone_fragmenter.cc
   9  db/range_tombstone_fragmenter.h
   7  include/rocksdb/options.h
   7  options/db_options.cc
   1  options/db_options.h
   2  options/options_helper.cc
   1  options/options_settable_test.cc

@ -18,6 +18,7 @@
* Added more fields to FilterBuildingContext with LSM details, for custom filter policies that vary behavior based on where they are in the LSM-tree.
* Added DB::Properties::kBlockCacheEntryStats for querying statistics on what percentage of block cache is used by various kinds of blocks, etc. using DB::GetProperty and DB::GetMapProperty. The same information is now dumped to info LOG periodically according to `stats_dump_period_sec`.
* Add an experimental Remote Compaction feature, which allows the user to run Compaction on a different host or process. The feature is still under development, currently only works on some basic use cases. The interface will be changed without backward/forward compatibility support.
* RocksDB now validates the total number of entries read during flush and compares it with the number of entries inserted into the memtable. If the counts do not match and flush_verify_memtable_count = true (default), the flush fails; otherwise the mismatch is only logged to the info log.
### Performance Improvements
* BlockPrefetcher is used by iterators to prefetch data if they anticipate more data to be used in future. It is enabled implicitly by rocksdb. Added change to take in account read pattern if reads are sequential. This would disable prefetching for random reads in MultiGet and iterators as readahead_size is increased exponential doing large prefetches.

@ -69,7 +69,7 @@ Status BuildTable(
int job_id, const Env::IOPriority io_priority,
TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
const std::string* full_history_ts_low,
BlobFileCompletionCallback* blob_callback) {
BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries) {
assert((tboptions.column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
tboptions.column_family_name.empty());
@ -88,7 +88,10 @@ Status BuildTable(
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
new CompactionRangeDelAggregator(&tboptions.internal_comparator,
snapshots));
uint64_t num_unfragmented_tombstones = 0;
for (auto& range_del_iter : range_del_iters) {
num_unfragmented_tombstones +=
range_del_iter->num_unfragmented_tombstones();
range_del_agg->AddTombstones(std::move(range_del_iter));
}
@ -231,6 +234,10 @@ Status BuildTable(
TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
const bool empty = builder->IsEmpty();
if (num_input_entries != nullptr) {
*num_input_entries =
c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
}
if (!s.ok() || empty) {
builder->Abandon();
} else {

@ -65,6 +65,7 @@ extern Status BuildTable(
TableProperties* table_properties = nullptr,
Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
const std::string* full_history_ts_low = nullptr,
BlobFileCompletionCallback* blob_callback = nullptr);
BlobFileCompletionCallback* blob_callback = nullptr,
uint64_t* num_input_entries = nullptr);
} // namespace ROCKSDB_NAMESPACE

@ -33,7 +33,6 @@
(snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq))))
namespace ROCKSDB_NAMESPACE {
CompactionIterator::CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
@ -73,7 +72,10 @@ CompactionIterator::CompactionIterator(
const std::atomic<int>* manual_compaction_paused,
const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low)
: input_(input),
: input_(
input, cmp,
compaction ==
nullptr),  // Entry counting is currently only needed for flush (compaction == nullptr).
cmp_(cmp),
merge_helper_(merge_helper),
snapshots_(snapshots),
@ -130,13 +132,13 @@ CompactionIterator::CompactionIterator(
assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
timestamp_size_ == full_history_ts_low_->size());
#endif
input_->SetPinnedItersMgr(&pinned_iters_mgr_);
input_.SetPinnedItersMgr(&pinned_iters_mgr_);
TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
}
CompactionIterator::~CompactionIterator() {
// input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime
input_->SetPinnedItersMgr(nullptr);
input_.SetPinnedItersMgr(nullptr);
}
void CompactionIterator::ResetRecordCounts() {
@ -189,7 +191,7 @@ void CompactionIterator::Next() {
// Only advance the input iterator if there is no merge output and the
// iterator is not already at the next record.
if (!at_next_) {
input_->Next();
AdvanceInputIter();
}
NextFromInput();
}
@ -356,10 +358,10 @@ void CompactionIterator::NextFromInput() {
at_next_ = false;
valid_ = false;
while (!valid_ && input_->Valid() && !IsPausingManualCompaction() &&
while (!valid_ && input_.Valid() && !IsPausingManualCompaction() &&
!IsShuttingDown()) {
key_ = input_->key();
value_ = input_->value();
key_ = input_.key();
value_ = input_.value();
iter_stats_.num_input_records++;
Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
@ -559,12 +561,12 @@ void CompactionIterator::NextFromInput() {
// The easiest way to process a SingleDelete during iteration is to peek
// ahead at the next key.
ParsedInternalKey next_ikey;
input_->Next();
AdvanceInputIter();
// Check whether the next key exists, is not corrupt, and is the same key
// as the single delete.
if (input_->Valid() &&
ParseInternalKey(input_->key(), &next_ikey, allow_data_in_errors_)
if (input_.Valid() &&
ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
.ok() &&
cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
// Check whether the next key belongs to the same snapshot as the
@ -578,7 +580,7 @@ void CompactionIterator::NextFromInput() {
// to handle the second SingleDelete
// First SingleDelete has been skipped since we already called
// input_->Next().
// input_.Next().
++iter_stats_.num_record_drop_obsolete;
++iter_stats_.num_single_del_mismatch;
} else if (has_outputted_key_ ||
@ -600,9 +602,9 @@ void CompactionIterator::NextFromInput() {
++iter_stats_.num_record_drop_hidden;
++iter_stats_.num_record_drop_obsolete;
// Already called input_->Next() once. Call it a second time to
// Already called input_.Next() once. Call it a second time to
// skip past the second key.
input_->Next();
AdvanceInputIter();
} else {
// Found a matching value, but we cannot drop both keys since
// there is an earlier snapshot and we need to leave behind a record
@ -670,7 +672,7 @@ void CompactionIterator::NextFromInput() {
}
++iter_stats_.num_record_drop_hidden; // rule (A)
input_->Next();
AdvanceInputIter();
} else if (compaction_ != nullptr &&
(ikey_.type == kTypeDeletion ||
(ikey_.type == kTypeDeletionWithTimestamp &&
@ -706,7 +708,7 @@ void CompactionIterator::NextFromInput() {
if (!bottommost_level_) {
++iter_stats_.num_optimized_del_drop_obsolete;
}
input_->Next();
AdvanceInputIter();
} else if ((ikey_.type == kTypeDeletion ||
(ikey_.type == kTypeDeletionWithTimestamp &&
cmp_with_history_ts_low_ < 0)) &&
@ -717,7 +719,7 @@ void CompactionIterator::NextFromInput() {
assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
ikey_.user_key, &level_ptrs_));
ParsedInternalKey next_ikey;
input_->Next();
AdvanceInputIter();
// Skip over all versions of this key that happen to occur in the same
// snapshot range as the delete.
//
@ -725,18 +727,18 @@ void CompactionIterator::NextFromInput() {
// considered to have a different user key unless the timestamp is older
// than *full_history_ts_low_.
while (!IsPausingManualCompaction() && !IsShuttingDown() &&
input_->Valid() &&
(ParseInternalKey(input_->key(), &next_ikey, allow_data_in_errors_)
input_.Valid() &&
(ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
.ok()) &&
cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
(prev_snapshot == 0 ||
DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) {
input_->Next();
AdvanceInputIter();
}
// If you find you still need to output a row with this key, we need to output the
// delete too
if (input_->Valid() &&
(ParseInternalKey(input_->key(), &next_ikey, allow_data_in_errors_)
if (input_.Valid() &&
(ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
.ok()) &&
cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
valid_ = true;
@ -755,7 +757,7 @@ void CompactionIterator::NextFromInput() {
// We encapsulate the merge related state machine in a different
// object to minimize change to the existing flow.
Status s =
merge_helper_->MergeUntil(input_, range_del_agg_, prev_snapshot,
merge_helper_->MergeUntil(&input_, range_del_agg_, prev_snapshot,
bottommost_level_, allow_data_in_errors_);
merge_out_iter_.SeekToFirst();
@ -799,14 +801,14 @@ void CompactionIterator::NextFromInput() {
if (should_delete) {
++iter_stats_.num_record_drop_hidden;
++iter_stats_.num_record_drop_range_del;
input_->Next();
AdvanceInputIter();
} else {
valid_ = true;
}
}
if (need_skip) {
input_->Seek(skip_until);
SkipUntil(skip_until);
}
}

@ -24,6 +24,49 @@ namespace ROCKSDB_NAMESPACE {
class BlobFileBuilder;
// A wrapper of internal iterator whose purpose is to count how
// many entries there are in the iterator.
class SequenceIterWrapper : public InternalIterator {
public:
SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
bool need_count_entries)
: cmp_(cmp), inner_iter_(iter), need_count_entries_(need_count_entries) {}
bool Valid() const override { return inner_iter_->Valid(); }
Status status() const override { return inner_iter_->status(); }
void Next() override {
num_itered_++;
inner_iter_->Next();
}
void Seek(const Slice& target) override {
if (!need_count_entries_) {
inner_iter_->Seek(target);
} else {
// For flush cases, we need to count total number of entries, so we
// do Next() rather than Seek().
while (inner_iter_->Valid() &&
cmp_->Compare(inner_iter_->key(), target) < 0) {
Next();
}
}
}
Slice key() const override { return inner_iter_->key(); }
Slice value() const override { return inner_iter_->value(); }
// Unused InternalIterator methods
void SeekToFirst() override { assert(false); }
void Prev() override { assert(false); }
void SeekForPrev(const Slice& /* target */) override { assert(false); }
void SeekToLast() override { assert(false); }
uint64_t num_itered() const { return num_itered_; }
private:
const Comparator* cmp_; // not owned
InternalIterator* inner_iter_; // not owned
uint64_t num_itered_ = 0;
bool need_count_entries_;
};
class CompactionIterator {
public:
// A wrapper around Compaction. Has a much smaller interface, only what
@ -162,6 +205,7 @@ class CompactionIterator {
bool Valid() const { return valid_; }
const Slice& user_key() const { return current_user_key_; }
const CompactionIterationStats& iter_stats() const { return iter_stats_; }
uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
private:
// Processes the input stream to find the next output
@ -234,7 +278,7 @@ class CompactionIterator {
static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber(
const CompactionProxy* compaction);
InternalIterator* input_;
SequenceIterWrapper input_;
const Comparator* cmp_;
MergeHelper* merge_helper_;
const std::vector<SequenceNumber>* snapshots_;
@ -342,6 +386,10 @@ class CompactionIterator {
const int level_;
void AdvanceInputIter() { input_.Next(); }
void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
bool IsShuttingDown() {
// This is a best-effort facility, so memory_order_relaxed is sufficient.
return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
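
To motivate the Seek() behavior of SequenceIterWrapper above: in the flush path every entry must be counted, so Seek() is emulated by stepping with Next(). Below is a minimal standalone sketch of that idea (hypothetical MiniIter and CountingSeekWrapper types, not RocksDB code):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy forward iterator over sorted string keys.
class MiniIter {
 public:
  explicit MiniIter(std::vector<std::string> keys) : keys_(std::move(keys)) {}
  bool Valid() const { return pos_ < keys_.size(); }
  const std::string& key() const { return keys_[pos_]; }
  void Next() { ++pos_; }

 private:
  std::vector<std::string> keys_;
  size_t pos_ = 0;
};

// Counts every entry it moves past, even when "seeking".
class CountingSeekWrapper {
 public:
  explicit CountingSeekWrapper(MiniIter* iter) : iter_(iter) {}
  void Next() {
    ++num_itered_;
    iter_->Next();
  }
  // Emulate Seek() with Next() so skipped entries are still counted,
  // mirroring what SequenceIterWrapper does when need_count_entries_ is true.
  void Seek(const std::string& target) {
    while (iter_->Valid() && iter_->key() < target) {
      Next();
    }
  }
  uint64_t num_itered() const { return num_itered_; }

 private:
  MiniIter* iter_;  // not owned
  uint64_t num_itered_ = 0;
};

int main() {
  MiniIter iter({"a", "b", "c", "d"});
  CountingSeekWrapper wrapper(&iter);
  wrapper.Seek("c");                          // skips "a" and "b"
  std::cout << wrapper.num_itered() << "\n";  // prints 2: skipped keys counted
  return 0;
}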

@ -4988,7 +4988,7 @@ TEST_F(DBTest2, SameSmallestInSameLevel) {
ASSERT_OK(Put("key", "2"));
ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
Flush();
ASSERT_OK(Flush());
CompactRangeOptions cro;
cro.change_level = true;
cro.target_level = 2;

@ -166,7 +166,6 @@ void FlushJob::RecordFlushIOStats() {
ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
IOSTATS_RESET(bytes_written);
}
void FlushJob::PickMemTable() {
db_mutex_->AssertHeld();
assert(!pick_memtable_called);
@ -403,6 +402,7 @@ Status FlushJob::WriteLevel0Table() {
? current_time
: meta_.oldest_ancester_time;
uint64_t num_input_entries = 0;
IOStatus io_s;
const std::string* const full_history_ts_low =
(full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
@ -420,10 +420,22 @@ Status FlushJob::WriteLevel0Table() {
earliest_write_conflict_snapshot_, snapshot_checker_,
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
&io_s, io_tracer_, event_logger_, job_context_->job_id, Env::IO_HIGH,
&table_properties_, write_hint, full_history_ts_low, blob_callback_);
&table_properties_, write_hint, full_history_ts_low, blob_callback_,
&num_input_entries);
if (!io_s.ok()) {
io_status_ = io_s;
}
if (num_input_entries != total_num_entries && s.ok()) {
std::string msg = "Expected " + ToString(total_num_entries) +
" entries in memtables, but read " +
ToString(num_input_entries);
ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
cfd_->GetName().c_str(), job_context_->job_id,
msg.c_str());
if (db_options_.flush_verify_memtable_count) {
s = Status::Corruption(msg);
}
}
LogFlush(db_options_.info_log);
}
ROCKS_LOG_INFO(db_options_.info_log,

@ -25,12 +25,12 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
return;
}
bool is_sorted = true;
int num_tombstones = 0;
InternalKey pinned_last_start_key;
Slice last_start_key;
num_unfragmented_tombstones_ = 0;
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next(), num_tombstones++) {
if (num_tombstones > 0 &&
unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) {
if (num_unfragmented_tombstones_ > 0 &&
icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
is_sorted = false;
break;
@ -50,8 +50,8 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
// Sort the tombstones before fragmenting them.
std::vector<std::string> keys, values;
keys.reserve(num_tombstones);
values.reserve(num_tombstones);
keys.reserve(num_unfragmented_tombstones_);
values.reserve(num_unfragmented_tombstones_);
for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
unfragmented_tombstones->Next()) {
keys.emplace_back(unfragmented_tombstones->key().data(),

@ -68,6 +68,10 @@ struct FragmentedRangeTombstoneList {
// number in [lower, upper].
bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;
uint64_t num_unfragmented_tombstones() const {
return num_unfragmented_tombstones_;
}
private:
// Given an ordered range tombstone iterator unfragmented_tombstones,
// "fragment" the tombstones into non-overlapping pieces, and store them in
@ -82,6 +86,7 @@ struct FragmentedRangeTombstoneList {
std::set<SequenceNumber> seq_set_;
std::list<std::string> pinned_slices_;
PinnedIteratorsManager pinned_iters_mgr_;
uint64_t num_unfragmented_tombstones_;
};
// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
@ -180,6 +185,10 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
SequenceNumber upper_bound() const { return upper_bound_; }
SequenceNumber lower_bound() const { return lower_bound_; }
uint64_t num_unfragmented_tombstones() const {
return tombstones_->num_unfragmented_tombstones();
}
private:
using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;

@ -430,6 +430,13 @@ struct DBOptions {
// Default: true
bool paranoid_checks = true;
// If true, during memtable flush, RocksDB will validate total entries
// read in flush, and compare with counter inserted into it.
// The option is here to turn the feature off in case this new validation
// feature has a bug.
// Default: true
bool flush_verify_memtable_count = true;
// If true, the log numbers and sizes of the synced WALs are tracked
// in MANIFEST, then during DB recovery, if a synced WAL is missing
// from disk, or the WAL's size does not match the recorded size in
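
A hedged usage sketch for the new option (assuming a throwaway DB path /tmp/flush_verify_example): setting flush_verify_memtable_count to false so a count mismatch is only logged instead of failing the flush with Status::Corruption.

#include <rocksdb/db.h>
#include <rocksdb/options.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Default is true. Set to false to disable the new flush-time entry-count
  // verification, e.g. if the validation itself turns out to be buggy.
  options.flush_verify_memtable_count = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/flush_verify_example", &db);
  if (!s.ok()) {
    return 1;
  }
  delete db;
  return 0;
}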

@ -200,6 +200,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct ImmutableDBOptions, paranoid_checks),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"flush_verify_memtable_count",
{offsetof(struct ImmutableDBOptions, flush_verify_memtable_count),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"track_and_verify_wals_in_manifest",
{offsetof(struct ImmutableDBOptions,
track_and_verify_wals_in_manifest),
@ -503,6 +507,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
create_missing_column_families(options.create_missing_column_families),
error_if_exists(options.error_if_exists),
paranoid_checks(options.paranoid_checks),
flush_verify_memtable_count(options.flush_verify_memtable_count),
track_and_verify_wals_in_manifest(
options.track_and_verify_wals_in_manifest),
env(options.env),
@ -598,6 +603,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
create_if_missing);
ROCKS_LOG_HEADER(log, " Options.paranoid_checks: %d",
paranoid_checks);
ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d",
flush_verify_memtable_count);
ROCKS_LOG_HEADER(log,
" "
"Options.track_and_verify_wals_in_manifest: %d",

@ -24,6 +24,7 @@ struct ImmutableDBOptions {
bool create_missing_column_families;
bool error_if_exists;
bool paranoid_checks;
bool flush_verify_memtable_count;
bool track_and_verify_wals_in_manifest;
Env* env;
std::shared_ptr<RateLimiter> rate_limiter;

@ -65,6 +65,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
immutable_db_options.create_missing_column_families;
options.error_if_exists = immutable_db_options.error_if_exists;
options.paranoid_checks = immutable_db_options.paranoid_checks;
options.flush_verify_memtable_count =
immutable_db_options.flush_verify_memtable_count;
options.track_and_verify_wals_in_manifest =
immutable_db_options.track_and_verify_wals_in_manifest;
options.env = immutable_db_options.env;

@ -287,6 +287,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"skip_log_error_on_recovery=true;"
"writable_file_max_buffer_size=1048576;"
"paranoid_checks=true;"
"flush_verify_memtable_count=true;"
"track_and_verify_wals_in_manifest=true;"
"is_fd_close_on_exec=false;"
"bytes_per_sync=4295013613;"
