From 59ba104e4a5e381c58ce9a2e0b91432730f1f0e1 Mon Sep 17 00:00:00 2001 From: Jay Zhuang Date: Thu, 18 Feb 2021 08:47:02 -0800 Subject: [PATCH] Fix txn `MultiGet()` return un-committed data with snapshot (#7963) Summary: TransactionDB uses read callback to filter out un-committed data before a snapshot. But `MultiGet()` API doesn't use that at all, which causes returning unwanted data. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7963 Test Plan: Added unittest to reproduce Reviewed By: anand1976 Differential Revision: D26455851 Pulled By: jay-zhuang fbshipit-source-id: 265276698cf9d8c4cd79e3250ef10d14375bac55 --- HISTORY.md | 1 + db/db_impl/db_impl.cc | 7 ++-- utilities/transactions/transaction_test.cc | 41 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index fa967aded..67b4e1440 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ ### Bug Fixes * Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. * `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. +* Fix `WRITE_PREPARED`, `WRITE_UNPREPARED` TransactionDB `MultiGet()` may return uncommitted data with snapshot. ### Public API Change * Add new Append and PositionedAppend APIs to FileSystem to bring the data verification information (data checksum information) from upper layer (e.g., WritableFileWriter) to the storage layer. In this way, the customized FileSystem is able to verify the correctness of data being written to the storage on time. Add checksum_handoff_file_types to DBOptions. User can use this option to control which file types (Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile.) should use the new Append and PositionedAppend APIs to handoff the verification information. Currently, RocksDB only use crc32c to calculate the checksum for write handoff. diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 68fc3ce63..5a0792ee5 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1719,7 +1719,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, } // If timestamp is used, we use read callback to ensure is returned // only if t <= read_opts.timestamp and s <= snapshot. - if (ts_sz > 0 && !get_impl_options.callback) { + if (ts_sz > 0) { + assert(!get_impl_options + .callback); // timestamp with callback is not supported read_cb.Refresh(snapshot); get_impl_options.callback = &read_cb; } @@ -2395,8 +2397,9 @@ void DBImpl::MultiGetWithCallback( } GetWithTimestampReadCallback timestamp_read_callback(0); - ReadCallback* read_callback = nullptr; + ReadCallback* read_callback = callback; if (read_options.timestamp && read_options.timestamp->size() > 0) { + assert(!read_callback); // timestamp with callback is not supported timestamp_read_callback.Refresh(consistent_seqnum); read_callback = ×tamp_read_callback; } diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 27f9504e0..def6d0f6b 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -2925,6 +2925,47 @@ TEST_P(TransactionTest, MultiGetLargeBatchedTest) { } } +TEST_P(TransactionTest, MultiGetSnapshot) { + WriteOptions write_options; + TransactionOptions transaction_options; + Transaction* txn1 = db->BeginTransaction(write_options, transaction_options); + + Slice key = "foo"; + + Status s = txn1->Put(key, "bar"); + ASSERT_OK(s); + + s = txn1->SetName("test"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + + // Get snapshot between prepare and commit + // Un-committed data should be invisible to other transactions + const Snapshot* s1 = db->GetSnapshot(); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + Transaction* txn2 = db->BeginTransaction(write_options, transaction_options); + ReadOptions read_options; + read_options.snapshot = s1; + + std::vector keys; + std::vector values(1); + std::vector statuses(1); + keys.push_back(key); + auto cfd = db->DefaultColumnFamily(); + txn2->MultiGet(read_options, cfd, 1, keys.data(), values.data(), + statuses.data()); + ASSERT_TRUE(statuses[0].IsNotFound()); + delete txn2; + + db->ReleaseSnapshot(s1); +} + TEST_P(TransactionTest, ColumnFamiliesTest2) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options;