From fea2b1dfb2c554a5394e213b101516a311142fa9 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Fri, 1 Jun 2018 16:46:32 -0700
Subject: [PATCH] Copy Get() result when file reads use mmap

Summary:
For iterator reads, a `SuperVersion` is pinned to preserve a snapshot of SST
files, and `Block`s are pinned to allow `key()` and `value()` to return
pointers directly into a RocksDB memory region. This works for both non-mmap
reads, where the block owns the memory region, and mmap reads, where the file
owns the memory region.

For point reads with `PinnableSlice`, only the `Block` object is pinned. This
works for non-mmap reads because the block owns the memory region, so even if
the file is deleted after compaction, the memory region survives. However,
for mmap reads, file deletion causes the memory region to which the
`PinnableSlice` refers to be unmapped. The result is usually a segfault upon
accessing the `PinnableSlice`, although it sometimes returns wrong results
(I repro'd this a bunch of times with `db_stress`).

This PR copies the value into the `PinnableSlice` when it comes from mmap'd
memory. We can tell whether the `Block` owns its memory using
`Block::cachable()`, which is unset when reads do not use the provided
buffer, as is the case with mmap file reads. When that is false, we ensure
the result of `Get()` is copied.

This feels like a short-term solution, as ideally we'd have the
`PinnableSlice` pin the mmap'd memory so we can do zero-copy reads. That
seemed hard, so I chose this approach to fix correctness in the meantime.

Closes https://github.com/facebook/rocksdb/pull/3881

Differential Revision: D8076288

Pulled By: ajkr

fbshipit-source-id: 31d78ec010198723522323dbc6ea325122a46b08
---
 HISTORY.md                        |  1 +
 db/db_test2.cc                    | 21 +++++++++++++++++++++
 table/block.cc                    |  2 +-
 table/block.h                     | 21 +++++++++++++++------
 table/block_based_table_reader.cc |  5 +++--
 5 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index f2db631bb..c633014f6 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 
 ### Bug Fixes
 * fix deadlock with enable_pipelined_write=true and max_successive_merges > 0
+* Fix corruption in non-iterator reads when mmap is used for file reads
 
 ## 5.14.0 (5/16/2018)
 ### Public API Change
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 1550cf551..5043b38a9 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -2502,6 +2502,27 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
 
 #endif  // ROCKSDB_LITE
 
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+  Options options = CurrentOptions();
+  options.allow_mmap_reads = true;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  PinnableSlice pinned_value;
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+  dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+                              nullptr /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+
+  // Ensure pinned_value doesn't rely on memory munmap'd by the above
+  // compaction.
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/table/block.cc b/table/block.cc
index 15c5cd609..57bb50057 100644
--- a/table/block.cc
+++ b/table/block.cc
@@ -464,7 +464,7 @@ BlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
       total_order_seek ? nullptr : prefix_index_.get();
   ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
                        prefix_index_ptr, global_seqno_,
-                       read_amp_bitmap_.get(), key_includes_seq);
+                       read_amp_bitmap_.get(), key_includes_seq, cachable());
 
   if (read_amp_bitmap_) {
     if (read_amp_bitmap_->GetStatistics() != stats) {
diff --git a/table/block.h b/table/block.h
index e2f6e48d0..bac9afc81 100644
--- a/table/block.h
+++ b/table/block.h
@@ -220,22 +220,26 @@ class BlockIter final : public InternalIterator {
         key_includes_seq_(true),
         global_seqno_(kDisableGlobalSequenceNumber),
         read_amp_bitmap_(nullptr),
-        last_bitmap_offset_(0) {}
+        last_bitmap_offset_(0),
+        block_contents_pinned_(false) {}
 
   BlockIter(const Comparator* comparator, const Comparator* user_comparator,
             const char* data, uint32_t restarts, uint32_t num_restarts,
             BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
-            BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq)
+            BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq,
+            bool block_contents_pinned)
       : BlockIter() {
     Initialize(comparator, user_comparator, data, restarts, num_restarts,
-               prefix_index, global_seqno, read_amp_bitmap, key_includes_seq);
+               prefix_index, global_seqno, read_amp_bitmap, key_includes_seq,
+               block_contents_pinned);
   }
 
   void Initialize(const Comparator* comparator,
                   const Comparator* user_comparator, const char* data,
                   uint32_t restarts, uint32_t num_restarts,
                   BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
-                  BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq) {
+                  BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq,
+                  bool block_contents_pinned) {
     assert(data_ == nullptr);  // Ensure it is called only once
     assert(num_restarts > 0);  // Ensure the param is valid
 
@@ -251,6 +255,7 @@ class BlockIter final : public InternalIterator {
     read_amp_bitmap_ = read_amp_bitmap;
     last_bitmap_offset_ = current_ + 1;
     key_includes_seq_ = key_includes_seq;
+    block_contents_pinned_ = block_contents_pinned;
   }
 
   // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
@@ -312,9 +317,11 @@ class BlockIter final : public InternalIterator {
   PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
 #endif
 
-  virtual bool IsKeyPinned() const override { return key_pinned_; }
+  virtual bool IsKeyPinned() const override {
+    return block_contents_pinned_ && key_pinned_;
+  }
 
-  virtual bool IsValuePinned() const override { return true; }
+  virtual bool IsValuePinned() const override { return block_contents_pinned_; }
 
   size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
 
@@ -349,6 +356,8 @@
   BlockReadAmpBitmap* read_amp_bitmap_;
   // last `current_` value we report to read-amp bitmp
   mutable uint32_t last_bitmap_offset_;
+  // whether the block data is guaranteed to outlive this iterator
+  bool block_contents_pinned_;
 
   struct CachedPrevEntry {
     explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index d37e60e77..2462b880f 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -2226,8 +2226,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
           s = Status::Corruption(Slice());
         }
 
-        if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
-                                    &biter)) {
+        if (!get_context->SaveValue(
+                parsed_key, biter.value(), &matched,
+                biter.IsValuePinned() ? &biter : nullptr)) {
          done = true;
          break;
        }
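
To make the copy-vs-pin decision concrete: the `biter.IsValuePinned() ? &biter : nullptr`
change relies on `GetContext::SaveValue()` copying the value into the `PinnableSlice`
(roughly, `PinnableSlice::PinSelf()`) when it is given a null pinner, and referencing the
block's memory directly (`PinnableSlice::PinSlice()`) when it is given a non-null one. The
sketch below illustrates that decision with simplified stand-in types; `SaveValueSketch`
and the member layout are assumptions written for illustration, not RocksDB's actual
implementation.

    #include <cstddef>
    #include <string>

    // Simplified stand-ins for RocksDB types, for illustration only.
    struct Slice {
      const char* data;
      size_t size;
    };

    struct Cleanable {};  // the real Cleanable registers cleanup callbacks

    // A PinnableSlice either references external memory (zero-copy) or owns a copy.
    struct PinnableSlice {
      std::string owned;  // holds the value when it had to be copied
      const char* data = nullptr;
      size_t size = 0;

      // Zero-copy path: only safe if `pinner` keeps the memory alive and mapped.
      void PinSlice(const Slice& v, Cleanable* /*pinner*/) {
        data = v.data;
        size = v.size;
      }
      // Copy path: always safe, even if the source region is later munmap'd.
      void PinSelf(const Slice& v) {
        owned.assign(v.data, v.size);
        data = owned.data();
        size = owned.size();
      }
    };

    // The decision this PR enforces: a null pinner forces a copy.
    void SaveValueSketch(const Slice& value, Cleanable* pinner,
                         PinnableSlice* out) {
      if (pinner != nullptr) {
        out->PinSlice(value, pinner);  // block owns its memory; pin it
      } else {
        out->PinSelf(value);           // e.g. mmap-backed block; copy it
      }
    }

A caller that knows the block may be backed by memory that can be unmapped, as
`BlockBasedTable::Get()` now detects via `BlockIter::IsValuePinned()`, simply passes a
null pinner and always receives a safe copy.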