From f184bee77badd73e3e62540d1d5ba4ffd8120259 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 25 Jan 2019 17:07:00 -0800 Subject: [PATCH] PlainTable should avoid copying Get() results from immortal source. (#4924) Summary: https://github.com/facebook/rocksdb/pull/4053 avoids memcopy for Get() results if files are immortable (read-only DB, max_open_files=-1) and the file is ammaped. The same optimization is being applied to PlainTable here. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4924 Differential Revision: D13827749 Pulled By: siying fbshipit-source-id: 1f2cbfc530b40ce08ccd53f95f6e78de4d1c2f96 --- db/plain_table_db_test.cc | 52 ++++++++++++++++++++++++++++++++++++ table/get_context.cc | 2 ++ table/plain_table_factory.cc | 3 ++- table/plain_table_reader.cc | 10 +++++-- table/plain_table_reader.h | 3 ++- 5 files changed, 66 insertions(+), 4 deletions(-) diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index a98b1629a..9bcf1af49 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -158,6 +158,8 @@ class PlainTableDBTest : public testing::Test, db_ = nullptr; } + bool mmap_mode() const { return mmap_mode_; } + void DestroyAndReopen(Options* options = nullptr) { //Destroy using last options Destroy(&last_options_); @@ -174,6 +176,12 @@ class PlainTableDBTest : public testing::Test, return DB::Open(*options, dbname_, db); } + Status ReopenForReadOnly(Options* options) { + delete db_; + db_ = nullptr; + return DB::OpenForReadOnly(*options, dbname_, &db_); + } + Status TryReopen(Options* options = nullptr) { delete db_; db_ = nullptr; @@ -547,6 +555,50 @@ TEST_P(PlainTableDBTest, Flush2) { } } +TEST_P(PlainTableDBTest, Immortable) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_open_files = -1; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 10; + plain_table_options.encoding_type = encoding_type; + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + int copied = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ(2, copied); + copied = 0; + + Close(); + ASSERT_OK(ReopenForReadOnly(&options)); + + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + if (mmap_mode()) { + ASSERT_EQ(0, copied); + } else { + ASSERT_EQ(2, copied); + } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} + TEST_P(PlainTableDBTest, Iterator) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; huge_page_tlb_size += 2 * 1024 * 1024) { diff --git a/table/get_context.cc b/table/get_context.cc index e1d75fe4e..24c9ba7d5 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -221,6 +221,8 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, // If the backing resources for the value are provided, pin them pinnable_val_->PinSlice(value, value_pinner); } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); + // Otherwise copy the value pinnable_val_->PinSelf(value); } diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 273a1bd4f..2c3c90929 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -27,7 +27,8 @@ Status PlainTableFactory::NewTableReader( table_reader_options.internal_comparator, std::move(file), file_size, table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, table_options_.index_sparseness, table_options_.huge_page_tlb_size, - table_options_.full_scan_mode, table_reader_options.prefix_extractor); + table_options_.full_scan_mode, table_reader_options.immortal, + table_reader_options.prefix_extractor); } TableBuilder* PlainTableFactory::NewTableBuilder( diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 9dc50a525..f8797ff69 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -120,7 +120,8 @@ Status PlainTableReader::Open( std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, - bool full_scan_mode, const SliceTransform* prefix_extractor) { + bool full_scan_mode, const bool immortal_table, + const SliceTransform* prefix_extractor) { if (file_size > PlainTableIndex::kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } @@ -181,6 +182,10 @@ Status PlainTableReader::Open( new_reader->full_scan_mode_ = true; } + if (immortal_table && new_reader->file_info_.is_mmap_mode) { + new_reader->dummy_cleanable_.reset(new Cleanable()); + } + *table_reader = std::move(new_reader); return s; } @@ -598,7 +603,8 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, // can we enable the fast path? if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { bool dont_care __attribute__((__unused__)); - if (!get_context->SaveValue(found_key, found_value, &dont_care)) { + if (!get_context->SaveValue(found_key, found_value, &dont_care, + dummy_cleanable_.get())) { break; } } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 5f8248dd7..459418dc7 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -75,7 +75,7 @@ class PlainTableReader: public TableReader { uint64_t file_size, std::unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, - bool full_scan_mode, + bool full_scan_mode, const bool immortal_table = false, const SliceTransform* prefix_extractor = nullptr); InternalIterator* NewIterator(const ReadOptions&, @@ -157,6 +157,7 @@ class PlainTableReader: public TableReader { CacheAllocationPtr bloom_block_alloc_; const ImmutableCFOptions& ioptions_; + std::unique_ptr dummy_cleanable_; uint64_t file_size_; std::shared_ptr table_properties_;