Copy Get() result when file reads use mmap

Summary:
For iterator reads, a `SuperVersion` is pinned to preserve a snapshot of SST files, and `Block`s are pinned to allow `key()` and `value()` to return pointers directly into a RocksDB memory region. This works for both non-mmap reads, where the block owns the memory region, and mmap reads, where the file owns the memory region.

For point reads with `PinnableSlice`, only the `Block` object is pinned. This works for non-mmap reads because the block owns the memory region, so even if the file is deleted after compaction, the memory region survives. However, for mmap reads, file deletion causes the memory region to which the `PinnableSlice` refers to be unmapped.   The result is usually a segfault upon accessing the `PinnableSlice`, although sometimes it returned wrong results (I repro'd this a bunch of times with `db_stress`).

This PR copies the value into the `PinnableSlice` when it comes from mmap'd memory. We can tell whether the `Block` owns its memory using `Block::cachable()`, which is unset when reads do not use the provided buffer as is the case with mmap file reads. When that is false we ensure the result of `Get()` is copied.

This feels like a short-term solution as ideally we'd have the `PinnableSlice` pin the mmap'd memory so we can do zero-copy reads. It seemed hard so I chose this approach to fix correctness in the meantime.
Closes https://github.com/facebook/rocksdb/pull/3881

Differential Revision: D8076288

Pulled By: ajkr

fbshipit-source-id: 31d78ec010198723522323dbc6ea325122a46b08
main
Andrew Kryczka 7 years ago committed by Facebook Github Bot
parent 88c3ee2d31
commit fea2b1dfb2
  1. 1
      HISTORY.md
  2. 21
      db/db_test2.cc
  3. 2
      table/block.cc
  4. 21
      table/block.h
  5. 5
      table/block_based_table_reader.cc

@ -9,6 +9,7 @@
### Bug Fixes
* fix deadlock with enable_pipelined_write=true and max_successive_merges > 0
* Fix corruption in non-iterator reads when mmap is used for file reads
## 5.14.0 (5/16/2018)
### Public API Change

@ -2502,6 +2502,27 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
#endif // ROCKSDB_LITE
TEST_F(DBTest2, PinnableSliceAndMmapReads) {
Options options = CurrentOptions();
options.allow_mmap_reads = true;
Reopen(options);
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
PinnableSlice pinned_value;
ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
ASSERT_EQ(pinned_value.ToString(), "bar");
dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
nullptr /* end */, nullptr /* column_family */,
true /* disallow_trivial_move */);
// Ensure pinned_value doesn't rely on memory munmap'd by the above
// compaction.
ASSERT_EQ(pinned_value.ToString(), "bar");
}
} // namespace rocksdb
int main(int argc, char** argv) {

@ -464,7 +464,7 @@ BlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
total_order_seek ? nullptr : prefix_index_.get();
ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
prefix_index_ptr, global_seqno_,
read_amp_bitmap_.get(), key_includes_seq);
read_amp_bitmap_.get(), key_includes_seq, cachable());
if (read_amp_bitmap_) {
if (read_amp_bitmap_->GetStatistics() != stats) {

@ -220,22 +220,26 @@ class BlockIter final : public InternalIterator {
key_includes_seq_(true),
global_seqno_(kDisableGlobalSequenceNumber),
read_amp_bitmap_(nullptr),
last_bitmap_offset_(0) {}
last_bitmap_offset_(0),
block_contents_pinned_(false) {}
BlockIter(const Comparator* comparator, const Comparator* user_comparator,
const char* data, uint32_t restarts, uint32_t num_restarts,
BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq)
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq,
bool block_contents_pinned)
: BlockIter() {
Initialize(comparator, user_comparator, data, restarts, num_restarts,
prefix_index, global_seqno, read_amp_bitmap, key_includes_seq);
prefix_index, global_seqno, read_amp_bitmap, key_includes_seq,
block_contents_pinned);
}
void Initialize(const Comparator* comparator,
const Comparator* user_comparator, const char* data,
uint32_t restarts, uint32_t num_restarts,
BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq) {
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq,
bool block_contents_pinned) {
assert(data_ == nullptr); // Ensure it is called only once
assert(num_restarts > 0); // Ensure the param is valid
@ -251,6 +255,7 @@ class BlockIter final : public InternalIterator {
read_amp_bitmap_ = read_amp_bitmap;
last_bitmap_offset_ = current_ + 1;
key_includes_seq_ = key_includes_seq;
block_contents_pinned_ = block_contents_pinned;
}
// Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
@ -312,9 +317,11 @@ class BlockIter final : public InternalIterator {
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif
virtual bool IsKeyPinned() const override { return key_pinned_; }
virtual bool IsKeyPinned() const override {
return block_contents_pinned_ && key_pinned_;
}
virtual bool IsValuePinned() const override { return true; }
virtual bool IsValuePinned() const override { return block_contents_pinned_; }
size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
@ -349,6 +356,8 @@ class BlockIter final : public InternalIterator {
BlockReadAmpBitmap* read_amp_bitmap_;
// last `current_` value we report to read-amp bitmp
mutable uint32_t last_bitmap_offset_;
// whether the block data is guaranteed to outlive this iterator
bool block_contents_pinned_;
struct CachedPrevEntry {
explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,

@ -2226,8 +2226,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
s = Status::Corruption(Slice());
}
if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
&biter)) {
if (!get_context->SaveValue(
parsed_key, biter.value(), &matched,
biter.IsValuePinned() ? &biter : nullptr)) {
done = true;
break;
}

Loading…
Cancel
Save