From ed9f5e21aa21d1bba868c975bf111ac6fb2958df Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 11 Apr 2019 10:22:07 -0700 Subject: [PATCH] Change OptimizeForPointLookup() and OptimizeForSmallDb() (#5165) Summary: Change the behavior of OptimizeForSmallDb() so that it is less likely to run out of memory. Change the behavior of OptimizeForPointLookup() to take advantage of the new memtable whole key filter, and move away from prefix extractor as well as hash-based indexing, as they are prone to misuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5165 Differential Revision: D14880709 Pulled By: siying fbshipit-source-id: 9af30e3c9e151eceea6d6b38701a58f1f9fb692d --- HISTORY.md | 2 ++ db/db_test2.cc | 35 +++++++++++++++++++++++++++++++++++ env/env_encryption.cc | 5 +++-- include/rocksdb/options.h | 8 ++++++-- options/options.cc | 33 +++++++++++++++++++++++++++------ 5 files changed, 73 insertions(+), 10 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 3abe5a79f..ff6d05d5b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,8 @@ * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. ### Public API Change +* Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. +* Change the behavior of OptimizeForSmallDb(): use a 16MB block cache, put index and filter blocks into it, and cost the memtable size to it. DBOptions.OptimizeForSmallDb() and ColumnFamilyOptions.OptimizeForSmallDb() now take an optional cache object. ### Bug Fixes * Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction. * Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries. 
diff --git a/db/db_test2.cc b/db/db_test2.cc index dabb34fa9..37382b2b8 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2396,6 +2396,41 @@ TEST_F(DBTest2, OptimizeForPointLookup) { ASSERT_EQ("v1", Get("foo")); } +TEST_F(DBTest2, OptimizeForSmallDB) { + Options options = CurrentOptions(); + Close(); + options.OptimizeForSmallDb(); + + // Find the cache object + ASSERT_EQ(std::string(BlockBasedTableFactory::kName), + std::string(options.table_factory->Name())); + BlockBasedTableOptions* table_options = + reinterpret_cast( + options.table_factory->GetOptions()); + ASSERT_TRUE(table_options != nullptr); + std::shared_ptr cache = table_options->block_cache; + + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + ASSERT_OK(Put("foo", "v1")); + + // memtable size is costed to the block cache + ASSERT_NE(0, cache->GetUsage()); + + ASSERT_EQ("v1", Get("foo")); + Flush(); + + size_t prev_size = cache->GetUsage(); + // Remember block cache size, so that we can find that + // it is filled after Get(). + // Use pinnable slice so that it can pin the block so that + // when we check the size it is not evicted. 
+ PinnableSlice value; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value)); + ASSERT_GT(cache->GetUsage(), prev_size); + value.Reset(); +} + #endif // ROCKSDB_LITE TEST_F(DBTest2, GetRaceFlush1) { diff --git a/env/env_encryption.cc b/env/env_encryption.cc index aa59e6635..df1b0011a 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -6,9 +6,9 @@ #ifndef ROCKSDB_LITE #include +#include #include #include -#include #include "rocksdb/env_encryption.h" #include "util/aligned_buffer.h" @@ -897,7 +897,8 @@ Status CTREncryptionProvider::CreateCipherStream( // very large chunk of the file (and very likely read over the bounds) assert(prefix.size() >= 2 * blockSize); if (prefix.size() < 2 * blockSize) { - return Status::Corruption("Unable to read from file " + fname + ": read attempt would read beyond file bounds"); + return Status::Corruption("Unable to read from file " + fname + + ": read attempt would read beyond file bounds"); } // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index f7d6dfaf5..c3fc355f3 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -88,7 +88,9 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Some functions that make it easier to optimize RocksDB // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. - ColumnFamilyOptions* OptimizeForSmallDb(); + // An optional cache object is passed in to be used as the block cache + ColumnFamilyOptions* OptimizeForSmallDb( + std::shared_ptr* cache = nullptr); // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls @@ -349,7 +351,9 @@ struct DBOptions { // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. 
- DBOptions* OptimizeForSmallDb(); + // An optional cache object can be passed in; if given, the memory of the + // memtable is costed to it + DBOptions* OptimizeForSmallDb(std::shared_ptr* cache = nullptr); #ifndef ROCKSDB_LITE // By default, RocksDB uses only one background thread for flush and diff --git a/options/options.cc b/options/options.cc index aaf8c68ab..bfe3e313d 100644 --- a/options/options.cc +++ b/options/options.cc @@ -413,8 +413,11 @@ Options::PrepareForBulkLoad() } Options* Options::OptimizeForSmallDb() { - ColumnFamilyOptions::OptimizeForSmallDb(); - DBOptions::OptimizeForSmallDb(); + // 16MB block cache + std::shared_ptr cache = NewLRUCache(16 << 20); + + ColumnFamilyOptions::OptimizeForSmallDb(&cache); + DBOptions::OptimizeForSmallDb(&cache); return this; } @@ -469,27 +472,44 @@ ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults( } // Optimization functions -DBOptions* DBOptions::OptimizeForSmallDb() { +DBOptions* DBOptions::OptimizeForSmallDb(std::shared_ptr* cache) { max_file_opening_threads = 1; max_open_files = 5000; + + // Cost memtable to block cache too. + std::shared_ptr wbm = + std::make_shared( + 0, (cache != nullptr) ? *cache : std::shared_ptr()); + write_buffer_manager = wbm; + return this; } -ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb() { +ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( + std::shared_ptr* cache) { write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; + + BlockBasedTableOptions table_options; + table_options.block_cache = + (cache != nullptr) ? 
*cache : std::shared_ptr(); + table_options.cache_index_and_filter_blocks = true; + // Two level iterator to avoid LRU cache imbalance + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_factory.reset(new BlockBasedTableFactory(table_options)); + + return this; } #ifndef ROCKSDB_LITE ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( uint64_t block_cache_size_mb) { - prefix_extractor.reset(NewNoopTransform()); BlockBasedTableOptions block_based_options; - block_based_options.index_type = BlockBasedTableOptions::kHashSearch; block_based_options.data_block_index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; block_based_options.data_block_hash_table_util_ratio = 0.75; @@ -498,6 +518,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); table_factory.reset(new BlockBasedTableFactory(block_based_options)); memtable_prefix_bloom_size_ratio = 0.02; + memtable_whole_key_filtering = true; return this; }