Change OptimizeForPointLookup() and OptimizeForSmallDb() (#5165)

Summary:
Change the behavior of OptimizeForSmallDb() so that it is less likely to go out of memory.
Change the behavior of OptimizeForPointLookup() to take advantage of the new memtable whole key filter, and move away from prefix extractor as well as hash-based indexing, as they are prone to misuse.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5165

Differential Revision: D14880709

Pulled By: siying

fbshipit-source-id: 9af30e3c9e151eceea6d6b38701a58f1f9fb692d
main
Siying Dong 6 years ago committed by Facebook Github Bot
parent d3d20dcdca
commit ed9f5e21aa
  1. 2
      HISTORY.md
  2. 35
      db/db_test2.cc
  3. 5
      env/env_encryption.cc
  4. 8
      include/rocksdb/options.h
  5. 33
      options/options.cc

@ -6,6 +6,8 @@
* Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level.
### Public API Change
* Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering.
* Change the behavior of OptimizeForSmallDb(): use a 16MB block cache, put index and filter blocks into it, and cost the memtable size to it. DBOptions.OptimizeForSmallDb() and ColumnFamilyOptions.OptimizeForSmallDb() start to take an optional cache object.
### Bug Fixes
* Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction.
* Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries.

@ -2396,6 +2396,41 @@ TEST_F(DBTest2, OptimizeForPointLookup) {
ASSERT_EQ("v1", Get("foo"));
}
TEST_F(DBTest2, OptimizeForSmallDB) {
Options options = CurrentOptions();
Close();
options.OptimizeForSmallDb();
// Find the cache object
ASSERT_EQ(std::string(BlockBasedTableFactory::kName),
std::string(options.table_factory->Name()));
BlockBasedTableOptions* table_options =
reinterpret_cast<BlockBasedTableOptions*>(
options.table_factory->GetOptions());
ASSERT_TRUE(table_options != nullptr);
std::shared_ptr<Cache> cache = table_options->block_cache;
ASSERT_EQ(0, cache->GetUsage());
ASSERT_OK(DB::Open(options, dbname_, &db_));
ASSERT_OK(Put("foo", "v1"));
// memtable size is costed to the block cache
ASSERT_NE(0, cache->GetUsage());
ASSERT_EQ("v1", Get("foo"));
Flush();
size_t prev_size = cache->GetUsage();
// Remember block cache size, so that we can find that
// it is filled after Get().
// Use pinnable slice so that it can ping the block so that
// when we check the size it is not evicted.
PinnableSlice value;
ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
ASSERT_GT(cache->GetUsage(), prev_size);
value.Reset();
}
#endif // ROCKSDB_LITE
TEST_F(DBTest2, GetRaceFlush1) {

@ -6,9 +6,9 @@
#ifndef ROCKSDB_LITE
#include <algorithm>
#include <cassert>
#include <cctype>
#include <iostream>
#include <cassert>
#include "rocksdb/env_encryption.h"
#include "util/aligned_buffer.h"
@ -897,7 +897,8 @@ Status CTREncryptionProvider::CreateCipherStream(
// very large chunk of the file (and very likely read over the bounds)
assert(prefix.size() >= 2 * blockSize);
if (prefix.size() < 2 * blockSize) {
return Status::Corruption("Unable to read from file " + fname + ": read attempt would read beyond file bounds");
return Status::Corruption("Unable to read from file " + fname +
": read attempt would read beyond file bounds");
}
// Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted)

@ -88,7 +88,9 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
// Some functions that make it easier to optimize RocksDB
// Use this if your DB is very small (like under 1GB) and you don't want to
// spend lots of memory for memtables.
ColumnFamilyOptions* OptimizeForSmallDb();
// An optional cache object is passed in to be used as the block cache
ColumnFamilyOptions* OptimizeForSmallDb(
std::shared_ptr<Cache>* cache = nullptr);
// Use this if you don't need to keep the data sorted, i.e. you'll never use
// an iterator, only Put() and Get() API calls
@ -349,7 +351,9 @@ struct DBOptions {
// Use this if your DB is very small (like under 1GB) and you don't want to
// spend lots of memory for memtables.
DBOptions* OptimizeForSmallDb();
// An optional cache object is passed in for the memory of the
// memtable to cost to
DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
#ifndef ROCKSDB_LITE
// By default, RocksDB uses only one background thread for flush and

@ -413,8 +413,11 @@ Options::PrepareForBulkLoad()
}
Options* Options::OptimizeForSmallDb() {
ColumnFamilyOptions::OptimizeForSmallDb();
DBOptions::OptimizeForSmallDb();
// 16MB block cache
std::shared_ptr<Cache> cache = NewLRUCache(16 << 20);
ColumnFamilyOptions::OptimizeForSmallDb(&cache);
DBOptions::OptimizeForSmallDb(&cache);
return this;
}
@ -469,27 +472,44 @@ ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults(
}
// Optimization functions
DBOptions* DBOptions::OptimizeForSmallDb() {
DBOptions* DBOptions::OptimizeForSmallDb(std::shared_ptr<Cache>* cache) {
max_file_opening_threads = 1;
max_open_files = 5000;
// Cost memtable to block cache too.
std::shared_ptr<rocksdb::WriteBufferManager> wbm =
std::make_shared<rocksdb::WriteBufferManager>(
0, (cache != nullptr) ? *cache : std::shared_ptr<Cache>());
write_buffer_manager = wbm;
return this;
}
ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb() {
ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb(
std::shared_ptr<Cache>* cache) {
write_buffer_size = 2 << 20;
target_file_size_base = 2 * 1048576;
max_bytes_for_level_base = 10 * 1048576;
soft_pending_compaction_bytes_limit = 256 * 1048576;
hard_pending_compaction_bytes_limit = 1073741824ul;
BlockBasedTableOptions table_options;
table_options.block_cache =
(cache != nullptr) ? *cache : std::shared_ptr<Cache>();
table_options.cache_index_and_filter_blocks = true;
// Two level iterator to avoid LRU cache imbalance
table_options.index_type =
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
table_factory.reset(new BlockBasedTableFactory(table_options));
return this;
}
#ifndef ROCKSDB_LITE
ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
uint64_t block_cache_size_mb) {
prefix_extractor.reset(NewNoopTransform());
BlockBasedTableOptions block_based_options;
block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
block_based_options.data_block_index_type =
BlockBasedTableOptions::kDataBlockBinaryAndHash;
block_based_options.data_block_hash_table_util_ratio = 0.75;
@ -498,6 +518,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
NewLRUCache(static_cast<size_t>(block_cache_size_mb * 1024 * 1024));
table_factory.reset(new BlockBasedTableFactory(block_based_options));
memtable_prefix_bloom_size_ratio = 0.02;
memtable_whole_key_filtering = true;
return this;
}

Loading…
Cancel
Save