diff --git a/HISTORY.md b/HISTORY.md index 3abe5a79f..ff6d05d5b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,8 @@ * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. ### Public API Change +* Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. +* Change the behavior of OptimizeForSmallDb(): use a 16MB block cache, put index and filter blocks into it, and cost the memtable size to it. DBOptions.OptimizeForSmallDb() and ColumnFamilyOptions.OptimizeForSmallDb() now take an optional cache object. ### Bug Fixes * Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction. * Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries. diff --git a/db/db_test2.cc b/db/db_test2.cc index dabb34fa9..37382b2b8 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2396,6 +2396,41 @@ TEST_F(DBTest2, OptimizeForPointLookup) { ASSERT_EQ("v1", Get("foo")); } +TEST_F(DBTest2, OptimizeForSmallDB) { + Options options = CurrentOptions(); + Close(); + options.OptimizeForSmallDb(); + + // Find the cache object + ASSERT_EQ(std::string(BlockBasedTableFactory::kName), + std::string(options.table_factory->Name())); + BlockBasedTableOptions* table_options = + reinterpret_cast( + options.table_factory->GetOptions()); + ASSERT_TRUE(table_options != nullptr); + std::shared_ptr cache = table_options->block_cache; + + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + ASSERT_OK(Put("foo", "v1")); + + // memtable size is costed to the block cache + ASSERT_NE(0, cache->GetUsage()); + + ASSERT_EQ("v1", Get("foo")); + Flush(); + + size_t prev_size = cache->GetUsage(); + // Remember block cache size, so that we can find that + // it is filled after Get(). 
+ // Use pinnable slice so that it can pin the block so that + // when we check the size it is not evicted. + PinnableSlice value; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value)); + ASSERT_GT(cache->GetUsage(), prev_size); + value.Reset(); +} + #endif // ROCKSDB_LITE TEST_F(DBTest2, GetRaceFlush1) { diff --git a/env/env_encryption.cc b/env/env_encryption.cc index aa59e6635..df1b0011a 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -6,9 +6,9 @@ #ifndef ROCKSDB_LITE #include +#include #include #include -#include #include "rocksdb/env_encryption.h" #include "util/aligned_buffer.h" @@ -897,7 +897,8 @@ Status CTREncryptionProvider::CreateCipherStream( // very large chunk of the file (and very likely read over the bounds) assert(prefix.size() >= 2 * blockSize); if (prefix.size() < 2 * blockSize) { - return Status::Corruption("Unable to read from file " + fname + ": read attempt would read beyond file bounds"); + return Status::Corruption("Unable to read from file " + fname + + ": read attempt would read beyond file bounds"); } // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index f7d6dfaf5..c3fc355f3 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -88,7 +88,9 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Some functions that make it easier to optimize RocksDB // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. - ColumnFamilyOptions* OptimizeForSmallDb(); + // An optional cache object is passed in to be used as the block cache + ColumnFamilyOptions* OptimizeForSmallDb( + std::shared_ptr* cache = nullptr); // Use this if you don't need to keep the data sorted, i.e. 
you'll never use // an iterator, only Put() and Get() API calls @@ -349,7 +351,9 @@ struct DBOptions { // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. - DBOptions* OptimizeForSmallDb(); + // An optional cache object can be passed in; the memory of the + // memtable is costed to it + DBOptions* OptimizeForSmallDb(std::shared_ptr* cache = nullptr); #ifndef ROCKSDB_LITE // By default, RocksDB uses only one background thread for flush and diff --git a/options/options.cc b/options/options.cc index aaf8c68ab..bfe3e313d 100644 --- a/options/options.cc +++ b/options/options.cc @@ -413,8 +413,11 @@ Options::PrepareForBulkLoad() } Options* Options::OptimizeForSmallDb() { - ColumnFamilyOptions::OptimizeForSmallDb(); - DBOptions::OptimizeForSmallDb(); + // 16MB block cache + std::shared_ptr cache = NewLRUCache(16 << 20); + + ColumnFamilyOptions::OptimizeForSmallDb(&cache); + DBOptions::OptimizeForSmallDb(&cache); return this; } @@ -469,27 +472,44 @@ ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults( } // Optimization functions -DBOptions* DBOptions::OptimizeForSmallDb() { +DBOptions* DBOptions::OptimizeForSmallDb(std::shared_ptr* cache) { max_file_opening_threads = 1; max_open_files = 5000; + + // Cost memtable to block cache too. + std::shared_ptr wbm = + std::make_shared( + 0, (cache != nullptr) ? *cache : std::shared_ptr()); + write_buffer_manager = wbm; + return this; } -ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb() { +ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( + std::shared_ptr* cache) { write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; + + BlockBasedTableOptions table_options; + table_options.block_cache = + (cache != nullptr) ? 
*cache : std::shared_ptr(); + table_options.cache_index_and_filter_blocks = true; + // Two level iterator to avoid LRU cache imbalance + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_factory.reset(new BlockBasedTableFactory(table_options)); + + return this; } #ifndef ROCKSDB_LITE ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( uint64_t block_cache_size_mb) { - prefix_extractor.reset(NewNoopTransform()); BlockBasedTableOptions block_based_options; - block_based_options.index_type = BlockBasedTableOptions::kHashSearch; block_based_options.data_block_index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; block_based_options.data_block_hash_table_util_ratio = 0.75; @@ -498,6 +518,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); table_factory.reset(new BlockBasedTableFactory(block_based_options)); memtable_prefix_bloom_size_ratio = 0.02; + memtable_whole_key_filtering = true; return this; }