diff --git a/HISTORY.md b/HISTORY.md index 5b24ca866..c778265d2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,6 +15,7 @@ * Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds. * Fix the bug that WriteBatchWithIndex's SeekForPrev() doesn't see the entries with the same key. * Fix the bug where user comparator was sometimes fed with InternalKey instead of the user key. The bug manifests when during GenerateBottommostFiles. +* Fix a bug in WritePrepared txns where if the number of old snapshots goes beyond the snapshot cache size (128 default) the rest will not be checked when evicting a commit entry from the commit cache. ## 5.17.1 (10/30/2018) ## Unreleased diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 127f8cc86..70b639578 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -816,6 +816,7 @@ TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshotsTest) { std::vector snapshots = {100l, 200l, 300l, 400l, 500l, 600l, 700l, 800l, 900l}; const size_t snapshot_cache_bits = 2; + const uint64_t cache_size = 1ul << snapshot_cache_bits; // Safety check to express the intended size in the test. Can be adjusted if // the snapshots lists changed. assert((1ul << snapshot_cache_bits) * 2 + 1 == snapshots.size()); @@ -843,6 +844,57 @@ TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshotsTest) { commit_entry.prep_seq <= snapshots.back(); ASSERT_EQ(expect_update, !wp_db->old_commit_map_empty_); } + + // Test that search will include multiple snapshot from snapshot cache + { + // exclude first and last item in the cache + CommitEntry commit_entry = {snapshots.front() + 1, + snapshots[cache_size - 1] - 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), cache_size - 2); + } + + // Test that search will include multiple snapshot from old snapshots + { + // include two in the middle + CommitEntry commit_entry = {snapshots[cache_size] + 1, + snapshots[cache_size + 2] + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), 2); + } + + // Test that search will include both snapshot cache and old snapshots + // Case 1: includes all in snapshot cache + { + CommitEntry commit_entry = {snapshots.front() - 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size()); + } + + // Case 2: includes all snapshot caches except the smallest + { + CommitEntry commit_entry = {snapshots.front() + 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - 1); + } + + // Case 3: includes only the largest of snapshot cache + { + CommitEntry commit_entry = {snapshots[cache_size - 1] - 1, + snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - cache_size + 1); + } } // This test is too slow for travis diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 2d8e4fcee..57da28a77 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -572,14 +572,16 @@ void WritePreparedTxnDB::ReleaseSnapshotInternal( bool need_gc = false; { WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64, + snap_seq); ReadLock rl(&old_commit_map_mutex_); auto prep_set_entry = old_commit_map_.find(snap_seq); need_gc = prep_set_entry != old_commit_map_.end(); } if (need_gc) { WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64, + snap_seq); WriteLock wl(&old_commit_map_mutex_); old_commit_map_.erase(snap_seq); old_commit_map_empty_.store(old_commit_map_.empty(), @@ -654,13 +656,20 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) { // place before gets overwritten the reader that reads bottom-up will // eventully see it. const bool next_is_larger = true; - SequenceNumber snapshot_seq = kMaxSequenceNumber; + // We will set to true if the border line snapshot suggests that. + bool search_larger_list = false; size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE); for (; 0 < ip1; ip1--) { - snapshot_seq = snapshot_cache_[ip1 - 1].load(std::memory_order_acquire); + SequenceNumber snapshot_seq = + snapshot_cache_[ip1 - 1].load(std::memory_order_acquire); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", ++sync_i); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i); + if (ip1 == SNAPSHOT_CACHE_SIZE) { // border line snapshot + // snapshot_seq < commit_seq => larger_snapshot_seq <= commit_seq + // then later also continue the search to larger snapshots + search_larger_list = snapshot_seq < evicted.commit_seq; + } if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, snapshot_seq, !next_is_larger)) { break; @@ -675,17 +684,20 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) { #endif TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:end"); TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:end"); - if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && ip1 == SNAPSHOT_CACHE_SIZE && - snapshot_seq < evicted.prep_seq)) { + if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && search_larger_list)) { // Then access the less efficient list of snapshots_ WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, + "snapshots_mutex_ overhead for <%" PRIu64 ",%" PRIu64 + "> with %" ROCKSDB_PRIszt " snapshots", + evicted.prep_seq, evicted.commit_seq, cnt); ReadLock rl(&snapshots_mutex_); // Items could have moved from the snapshots_ to snapshot_cache_ before // accquiring the lock. To make sure that we do not miss a valid snapshot, // read snapshot_cache_ again while holding the lock. for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) { - snapshot_seq = snapshot_cache_[i].load(std::memory_order_acquire); + SequenceNumber snapshot_seq = + snapshot_cache_[i].load(std::memory_order_acquire); if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, snapshot_seq, next_is_larger)) { break; @@ -713,7 +725,10 @@ bool WritePreparedTxnDB::MaybeUpdateOldCommitMap( // then snapshot_seq < commit_seq if (prep_seq <= snapshot_seq) { // overlapping range WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, + "old_commit_map_mutex_ overhead for %" PRIu64 + " commit entry: <%" PRIu64 ",%" PRIu64 ">", + snapshot_seq, prep_seq, commit_seq); WriteLock wl(&old_commit_map_mutex_); old_commit_map_empty_.store(false, std::memory_order_release); auto& vec = old_commit_map_[snapshot_seq]; diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index 0316a57a9..83562c94d 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -222,7 +222,6 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // rare case and it is ok to pay the cost of mutex ReadLock for such old, // reading transactions. WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); ReadLock rl(&old_commit_map_mutex_); auto prep_set_entry = old_commit_map_.find(snapshot_seq); bool found = prep_set_entry != old_commit_map_.end();