From 792762c42c231b037a6dc402511c5e591c9878e0 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Mon, 18 Apr 2016 09:42:50 -0700 Subject: [PATCH] Split db_test.cc Summary: Split db_test.cc into several files. Moving several helper functions into DBTestBase. Test Plan: make check Reviewers: sdong, yhchiang, IslamAbdelRahman Reviewed By: IslamAbdelRahman Subscribers: dhruba, andrewkr, kradhakrishnan, yhchiang, leveldb, sdong Differential Revision: https://reviews.facebook.net/D56715 --- CMakeLists.txt | 3 + Makefile | 12 + db/db_block_cache_test.cc | 221 +- db/db_bloom_filter_test.cc | 1047 +++ db/db_compaction_test.cc | 28 - db/db_iterator_test.cc | 1319 +++ db/db_sst_test.cc | 1300 +++ db/db_test.cc | 13031 +++++++++------------------ db/db_test2.cc | 5 - db/db_test_util.cc | 30 + db/db_test_util.h | 37 + db/db_universal_compaction_test.cc | 5 - db/db_wal_test.cc | 628 ++ src.mk | 6 +- 14 files changed, 8844 insertions(+), 8828 deletions(-) create mode 100644 db/db_bloom_filter_test.cc create mode 100644 db/db_iterator_test.cc create mode 100644 db/db_sst_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 2490d8208..0de146867 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -342,6 +342,9 @@ set(TESTS db/db_test.cc db/db_test2.cc db/db_block_cache_test.cc + db/db_bloom_filter_test.cc + db/db_iterator_test.cc + db/db_sst_test.cc db/db_universal_compaction_test.cc db/db_wal_test.cc db/dbformat_test.cc diff --git a/Makefile b/Makefile index 8325ba36a..cd5e3a5dd 100644 --- a/Makefile +++ b/Makefile @@ -258,12 +258,15 @@ TESTS = \ db_test \ db_test2 \ db_block_cache_test \ + db_bloom_filter_test \ db_iter_test \ db_log_iter_test \ db_compaction_filter_test \ db_compaction_test \ db_dynamic_level_test \ db_inplace_update_test \ + db_iterator_test \ + db_sst_test \ db_tailing_iter_test \ db_universal_compaction_test \ db_wal_test \ @@ -872,6 +875,9 @@ db_test2: db/db_test2.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) db_block_cache_test: db/db_block_cache_test.o 
db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_bloom_filter_test: db/db_bloom_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_log_iter_test: db/db_log_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -887,6 +893,12 @@ db_dynamic_level_test: db/db_dynamic_level_test.o db/db_test_util.o $(LIBOBJECTS db_inplace_update_test: db/db_inplace_update_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_iterator_test: db/db_iterator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_sst_test: db/db_sst_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 939cf44e0..1e2831e5b 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -12,11 +12,6 @@ namespace rocksdb { -static uint64_t TestGetTickerCount(const Options& options, - Tickers ticker_type) { - return options.statistics->getTickerCount(ticker_type); -} - class DBBlockCacheTest : public DBTestBase { private: size_t miss_count_ = 0; @@ -229,6 +224,222 @@ TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { delete iter; iter = nullptr; } + +// Make sure that when options.block_cache is set, after a new table is +// created its index/filter blocks are added to block cache. +TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "key", "val")); + // Create a new table. 
+ ASSERT_OK(Flush(1)); + + // index/filter blocks added to block cache right after table creation. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, /* only index/filter were added */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + uint64_t int_num; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + // Make sure filter block is in cache. + std::string value; + ReadOptions ropt; + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + + // Miss count should remain the same. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Make sure index block is in cache. 
+ auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); + value = Get(1, "key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(index_block_hit + 1, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + value = Get(1, "key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(index_block_hit + 2, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); +} + +TEST_F(DBBlockCacheTest, ParanoidFileChecks) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.level0_file_num_compaction_trigger = 2; + options.paranoid_file_checks = true; + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "1_key", "val")); + ASSERT_OK(Put(1, "9_key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); + ASSERT_EQ(1, /* read and cache data block */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Put(1, "1_key2", "val2")); + ASSERT_OK(Put(1, "9_key2", "val2")); + // Create a new SST file. This will further trigger a compaction + // and generate another file. + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(3, /* Totally 3 files created up to now */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // After disabling options.paranoid_file_checks. NO further block + // is added after generating a new file.
+ ASSERT_OK( + dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}})); + + ASSERT_OK(Put(1, "1_key3", "val3")); + ASSERT_OK(Put(1, "9_key3", "val3")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "1_key4", "val4")); + ASSERT_OK(Put(1, "9_key4", "val4")); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(3, /* No new block added: paranoid_file_checks is off */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); +} + +TEST_F(DBBlockCacheTest, CompressedCache) { + if (!Snappy_Supported()) { + return; + } + int num_iter = 80; + + // Run this test four iterations. + // Iteration 1: only an uncompressed block cache + // Iteration 2: only a compressed block cache + // Iteration 3: both block cache and compressed cache + // Iteration 4: both block cache and compressed cache, but DB is not + // compressed + for (int iter = 0; iter < 4; iter++) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; // small write buffer + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + switch (iter) { + case 0: + // only uncompressed block cache + table_options.block_cache = NewLRUCache(8 * 1024); + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 1: + // no block cache, only compressed cache + table_options.no_block_cache = true; + table_options.block_cache = nullptr; + table_options.block_cache_compressed = NewLRUCache(8 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 2: + // both compressed and uncompressed block cache + table_options.block_cache = NewLRUCache(1024); + table_options.block_cache_compressed = NewLRUCache(8 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 3: + // both block cache and compressed cache, but DB is not compressed + // also, make block cache sizes bigger, to trigger block cache hits +
table_options.block_cache = NewLRUCache(1024 * 1024); + table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.compression = kNoCompression; + break; + default: + ASSERT_TRUE(false); + } + CreateAndReopenWithCF({"pikachu"}, options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + no_block_cache_opts.table_factory.reset( + NewBlockBasedTableFactory(table_options_no_bc)); + ReopenWithColumnFamilies( + {"default", "pikachu"}, + std::vector<Options>({no_block_cache_opts, options})); + + Random rnd(301); + + // Write ~80KB (80 values, each 1000 bytes) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector<std::string> values; + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = RandomString(&rnd, 1000); + } + values.push_back(str); + ASSERT_OK(Put(1, Key(i), values[i])); + } + + // flush all data from memtable so that reads are from block cache + ASSERT_OK(Flush(1)); + + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } + + // check that we triggered the appropriate code paths in the cache + switch (iter) { + case 0: + // only uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 1: + // no block cache, only compressed cache + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 2: + // both compressed and uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options,
BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 3: + // both compressed and uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + // compressed doesn't have any hits since blocks are not compressed on + // storage + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0); + break; + default: + ASSERT_TRUE(false); + } + + options.create_if_missing = true; + DestroyAndReopen(options); + } +} + #endif } // namespace rocksdb diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc new file mode 100644 index 000000000..b9a86c31f --- /dev/null +++ b/db/db_bloom_filter_test.cc @@ -0,0 +1,1047 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" + +namespace rocksdb { + +// DB tests related to bloom filter. + +class DBBloomFilterTest : public DBTestBase { + public: + DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {} +}; + +// KeyMayExist can lead to a few false positives, but not false negatives. 
+// To make test deterministic, use a much larger number of bits per key-20 than +// bits in the key, so that false positives are eliminated +TEST_F(DBBloomFilterTest, KeyMayExist) { + do { + ReadOptions ropts; + std::string value; + anon::OptionsOverride options_override; + options_override.filter_policy.reset(NewBloomFilterPolicy(20)); + Options options = CurrentOptions(options_override); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + + ASSERT_OK(Put(1, "a", "b")); + bool value_found = false; + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(value_found); + ASSERT_EQ("b", value); + + ASSERT_OK(Flush(1)); + value.clear(); + + uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. 
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "a")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Flush(1)); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "c")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. 
+ } while ( + ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); +} + +// A delete is skipped for key if KeyMayExist(key) returns False +// Tests Writebatch consistency and proper delete behaviour +TEST_F(DBBloomFilterTest, FilterDeletes) { + do { + anon::OptionsOverride options_override; + options_override.filter_policy.reset(NewBloomFilterPolicy(20)); + Options options = CurrentOptions(options_override); + options.filter_deletes = true; + CreateAndReopenWithCF({"pikachu"}, options); + WriteBatch batch; + + batch.Delete(handles_[1], "a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("a", 1), "[ ]"); // Delete skipped + batch.Clear(); + + batch.Put(handles_[1], "a", "b"); + batch.Delete(handles_[1], "a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get(1, "a"), "NOT_FOUND"); + ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]"); // Delete issued + batch.Clear(); + + batch.Delete(handles_[1], "c"); + batch.Put(handles_[1], "c", "d"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get(1, "c"), "d"); + ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]"); // Delete skipped + batch.Clear(); + + ASSERT_OK(Flush(1)); // A stray Flush + + batch.Delete(handles_[1], "c"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]"); // Delete issued + batch.Clear(); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + 
ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + dbfull()->Flush(fo); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); +} + +TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + dbfull()->Flush(fo); + + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Reopen with whole key filtering enabled and prefix extractor + // NULL. Bloom filter should be off for both of whole key and + // prefix bloom. 
+ bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor.reset(); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + // Write DB with only full key filtering. + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Reopen with both of whole key off and prefix extractor enabled. + // Still no bloom filter should be used. + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Try to create a DB with mixed files: + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. 
+ ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + options.prefix_extractor.reset(); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + // Try to create a DB with mixed files. + ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); + // In this case needs insert some keys to make sure files are + // not filtered out by key ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + Flush(); + + // Now we have two files: + // File 1: An older file with prefix bloom. + // File 2: A newer file with whole bloom filter. + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + + // Reopen with the same setting: only whole key is used + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + + // Restart with both filters are allowed + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + // File 1 will has it filtered out. + // File 2 will not, as prefix `foo` exists in the file. + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + + // Restart with only prefix bloom is allowed. + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); +} + +TEST_F(DBBloomFilterTest, BloomFilter) { + do { + Options options = CurrentOptions(); + env_->count_random_reads_ = true; + options.env = env_; + // ChangeCompactOptions() only changes compaction style, which does not + // trigger reset of table_factory + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, options); + + // Populate multiple layers + const int N = 10000; + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Compact(1, "a", "z"); + for (int i = 0; i < N; i += 100) { + 
ASSERT_OK(Put(1, Key(i), Key(i))); + } + Flush(1); + + // Prevent auto compactions triggered by seeks + env_->delay_sstable_sync_.store(true, std::memory_order_release); + + // Lookup present keys. Should rarely read from small sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + int reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d present => %d reads\n", N, reads); + ASSERT_GE(reads, N); + ASSERT_LE(reads, N + 2 * N / 100); + + // Lookup missing keys. Should rarely read from either sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); + } + reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d missing => %d reads\n", N, reads); + ASSERT_LE(reads, 3 * N / 100); + + env_->delay_sstable_sync_.store(false, std::memory_order_release); + Close(); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBloomFilterTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); + } +} + +TEST_F(DBBloomFilterTest, BloomFilterCompatibility) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions
table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with block based filter + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with full filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with full filter + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with block_based filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +namespace { +// A wrapped bloom over default FilterPolicy +class WrappedBloom : public FilterPolicy { + public: + explicit 
WrappedBloom(int bits_per_key) + : filter_(NewBloomFilterPolicy(bits_per_key)), counter_(0) {} + + ~WrappedBloom() { delete filter_; } + + const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } + + void CreateFilter(const rocksdb::Slice* keys, int n, + std::string* dst) const override { + std::unique_ptr<rocksdb::Slice[]> user_keys(new rocksdb::Slice[n]); + for (int i = 0; i < n; ++i) { + user_keys[i] = convertKey(keys[i]); + } + return filter_->CreateFilter(user_keys.get(), n, dst); + } + + bool KeyMayMatch(const rocksdb::Slice& key, + const rocksdb::Slice& filter) const override { + counter_++; + return filter_->KeyMayMatch(convertKey(key), filter); + } + + uint32_t GetCounter() { return counter_; } + + private: + const FilterPolicy* filter_; + mutable uint32_t counter_; + + rocksdb::Slice convertKey(const rocksdb::Slice& key) const { return key; } +}; +} // namespace + +TEST_F(DBBloomFilterTest, BloomFilterWrapper) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + WrappedBloom* policy = new WrappedBloom(10); + table_options.filter_policy.reset(policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + ASSERT_EQ(0U, policy->GetCounter()); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ(1U * maxKey, policy->GetCounter()); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); +
ASSERT_EQ(2U * maxKey, policy->GetCounter()); +} + +class SliceTransformLimitedDomain : public SliceTransform { + const char* Name() const override { return "SliceTransformLimitedDomain"; } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 5); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= 5 && src[0] == 'x'; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? + return dst.size() == 5 && dst[0] == 'x'; + } +}; + +TEST_F(DBBloomFilterTest, PrefixExtractorFullFilter) { + BlockBasedTableOptions bbto; + // Full Filter Block + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1111_AAAA", "val1")); + ASSERT_OK(Put("x1112_AAAA", "val2")); + ASSERT_OK(Put("x1113_AAAA", "val3")); + ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, won't be added to filter + ASSERT_OK(Put("zzzzz_AAAA", "val5")); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("x1111_AAAA"), "val1"); + ASSERT_EQ(Get("x1112_AAAA"), "val2"); + ASSERT_EQ(Get("x1113_AAAA"), "val3"); + ASSERT_EQ(Get("x1114_AAAA"), "val4"); + // Was not added to filter but rocksdb will try to read it from the filter + ASSERT_EQ(Get("zzzzz_AAAA"), "val5"); +} + +TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) { + BlockBasedTableOptions bbto; + // Block Filter Block + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1113_AAAA", "val3")); + ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, won't be added to filter +
ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+  ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+  ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+  ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> iter_res;
+  auto iter = db_->NewIterator(ReadOptions());
+  // Seek to a key that was not in Domain
+  for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+    iter_res.emplace_back(iter->value().ToString());
+  }
+
+  std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+  ASSERT_EQ(iter_res, expected_res);
+  delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+class BloomStatsTestWithParam
+    : public DBBloomFilterTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  BloomStatsTestWithParam() {
+    use_block_table_ = std::get<0>(GetParam());
+    use_block_based_builder_ = std::get<1>(GetParam());
+
+    options_.create_if_missing = true;
+    options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
+    options_.memtable_prefix_bloom_bits = 8 * 1024;
+    if (use_block_table_) {
+      BlockBasedTableOptions table_options;
+      table_options.hash_index_allow_collision = false;
+      table_options.filter_policy.reset(
+          NewBloomFilterPolicy(10, use_block_based_builder_));
+      options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    } else {
+      PlainTableOptions table_options;
+      options_.table_factory.reset(NewPlainTableFactory(table_options));
+    }
+
+    perf_context.Reset();
+    DestroyAndReopen(options_);
+  }
+
+  ~BloomStatsTestWithParam() {
+    perf_context.Reset();
+    Destroy(options_);
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  bool use_block_table_;
+  bool use_block_based_builder_;
+  Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2
+// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expect SST bloom hit
stat to be 2 +// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1 +// Test both: block and plain SST +TEST_P(BloomStatsTestWithParam, BloomStatsTest) { + std::string key1("AAAA"); + std::string key2("RXDB"); // not in DB + std::string key3("ZBRA"); + std::string value1("Value1"); + std::string value3("Value3"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key3, value3, WriteOptions())); + + // check memtable bloom stats + ASSERT_EQ(value1, Get(key1)); + ASSERT_EQ(1, perf_context.bloom_memtable_hit_count); + ASSERT_EQ(value3, Get(key3)); + ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); + ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); + + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(1, perf_context.bloom_memtable_miss_count); + ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); + + // sanity checks + ASSERT_EQ(0, perf_context.bloom_sst_hit_count); + ASSERT_EQ(0, perf_context.bloom_sst_miss_count); + + Flush(); + + // sanity checks + ASSERT_EQ(0, perf_context.bloom_sst_hit_count); + ASSERT_EQ(0, perf_context.bloom_sst_miss_count); + + // check SST bloom stats + // NOTE: hits per get differs because of code paths differences + // in BlockBasedTable::Get() + int hits_per_get = use_block_table_ && !use_block_based_builder_ ? 
2 : 1;
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(hits_per_get, perf_context.bloom_sst_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2 * hits_per_get, perf_context.bloom_sst_hit_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, perf_context.bloom_sst_miss_count);
+}
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+  // check memtable bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, perf_context.bloom_memtable_miss_count);
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+
+  Flush();
+
+  iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+  // Check SST bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, perf_context.bloom_sst_hit_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, perf_context.bloom_sst_hit_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1,
perf_context.bloom_sst_miss_count); + ASSERT_EQ(2, perf_context.bloom_sst_hit_count); +} + +INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, + ::testing::Values(std::make_tuple(true, true), + std::make_tuple(true, false), + std::make_tuple(false, false))); + +namespace { +void PrefixScanInit(DBBloomFilterTest* dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. + + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } +} +} // namespace + 
+TEST_F(DBBloomFilterTest, PrefixScan) { + XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, + kSkipNoPrefix); + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + ASSERT_EQ(key.difference_offset(prefix), 8); + ASSERT_EQ(prefix.difference_offset(key), 8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // 11 RAND I/Os + DestroyAndReopen(options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (!iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while + XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0); +} + +TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; + options.arena_block_size = 4 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 256 * 1024; + 
options.max_write_buffer_number = 2;
+  options.max_background_compactions = 8;
+  options.max_background_flushes = 8;
+  options.compression = kNoCompression;
+  options.compaction_style = kCompactionStyleLevel;
+  options.level_compaction_dynamic_level_bytes = true;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.optimize_filters_for_hits = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  CreateAndReopenWithCF({"mypikachu"}, options);
+
+  int numkeys = 200000;
+
+  // Generate randomly shuffled keys, so the updates are almost
+  // random.
+  std::vector<int> keys;
+  keys.reserve(numkeys);
+  for (int i = 0; i < numkeys; i += 2) {
+    keys.push_back(i);
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
+
+  int num_inserted = 0;
+  for (int key : keys) {
+    ASSERT_OK(Put(1, Key(key), "val"));
+    if (++num_inserted % 1000 == 0) {
+      dbfull()->TEST_WaitForFlushMemTable();
+      dbfull()->TEST_WaitForCompact();
+    }
+  }
+  ASSERT_OK(Put(1, Key(0), "val"));
+  ASSERT_OK(Put(1, Key(numkeys), "val"));
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  if (NumTableFilesAtLevel(0, 1) == 0) {
+    // No Level 0 file. Create one.
+    ASSERT_OK(Put(1, Key(0), "val"));
+    ASSERT_OK(Put(1, Key(numkeys), "val"));
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_WaitForCompact();
+  }
+
+  for (int i = 1; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+  }
+
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // Now we have three sorted run, L0, L5 and L6 with most files in L6 have
+  // no bloom filter. Most keys be checked bloom filters twice.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); + ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); + + for (int i = 0; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + // Part 2 (read path): rewrite last level with blooms, then verify they get + // cached only if !optimize_filters_for_hits + options.disable_auto_compactions = true; + options.num_levels = 9; + options.optimize_filters_for_hits = false; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + MoveFilesToLevel(7 /* level */, 1 /* column family index */); + + std::string value = Get(1, Key(0)); + uint64_t prev_cache_filter_hits = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + value = Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_hits + 1, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Now that we know the filter blocks exist in the last level files, see if + // filter caching is skipped for this optimization + options.optimize_filters_for_hits = true; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + value = Get(1, Key(0)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // Check filter block ignored for files preloaded during DB::Open() + options.max_open_files = -1; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + uint64_t prev_cache_filter_misses = + 
TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Check filter block ignored for file trivially-moved to bottom level + bbto.block_cache.reset(); + options.max_open_files = 100; // setting > -1 makes it not preload all files + options.statistics = CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + ASSERT_OK(Put(1, Key(numkeys + 1), "val")); + ASSERT_OK(Flush(1)); + + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* arg) { trivial_move++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* arg) { non_trivial_move++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kSkip; + compact_options.change_level = true; + compact_options.target_level = 7; + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + prev_cache_filter_misses = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + value = Get(1, Key(numkeys + 1)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + + // Check filter block not cached for iterator + bbto.block_cache.reset(); + options.statistics = 
CreateDBStatistics();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+  iter->SeekToFirst();
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(2 /* index and data block */,
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 677cacb22..c794407e9 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -40,34 +40,6 @@ class DBCompactionTestWithParam
 };
 
 namespace {
-class OnFileDeletionListener : public EventListener {
- public:
-  OnFileDeletionListener() :
-    matched_count_(0),
-    expected_file_name_("") {}
-
-  void SetExpectedFileName(
-      const std::string file_name) {
-    expected_file_name_ = file_name;
-  }
-
-  void VerifyMatchedCount(size_t expected_value) {
-    ASSERT_EQ(matched_count_, expected_value);
-  }
-
-  void OnTableFileDeleted(
-      const TableFileDeletionInfo& info) override {
-    if (expected_file_name_ != "") {
-      ASSERT_EQ(expected_file_name_, info.file_path);
-      expected_file_name_ = "";
-      matched_count_++;
-    }
-  }
-
- private:
-  size_t matched_count_;
-  std::string expected_file_name_;
-};
 
 class FlushedFileCollector : public EventListener {
  public:
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
new file mode 100644
index 000000000..94bdbcd13
--- /dev/null
+++ b/db/db_iterator_test.cc
@@ -0,0 +1,1319 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+
+namespace rocksdb {
+
+class DBIteratorTest : public DBTestBase {
+ public:
+  DBIteratorTest() : DBTestBase("/db_iterator_test") {}
+};
+
+TEST_F(DBIteratorTest, IteratorProperty) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Put(1, "1", "2");
+  ReadOptions ropt;
+  ropt.pin_data = false;
+  {
+    unique_ptr<Iterator> iter(db_->NewIterator(ropt, handles_[1]));
+    iter->SeekToFirst();
+    std::string prop_value;
+    ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("0", prop_value);
+    iter->Next();
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("Iterator is not valid.", prop_value);
+  }
+  Close();
+}
+
+TEST_F(DBIteratorTest, PersistedTierOnIterator) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ReadOptions ropt;
+  ropt.read_tier = kPersistedTier;
+
+  auto* iter = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_TRUE(iter->status().IsNotSupported());
+  delete iter;
+
+  std::vector<Iterator*> iters;
+  ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+  Close();
+}
+
+TEST_F(DBIteratorTest, NonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
+
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // flush memtable to storage. Now, the key should not be in the
+    // memtable neither in the block cache.
+    ASSERT_OK(Flush(1));
+
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get(1, "a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // This test verifies block cache behaviors, which is not used by plain + // table format. + // Exclude kHashCuckoo as it does not support iteration currently + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | + kSkipMmapReads)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBIteratorTest, ManagedNonBlockingIteration) { + do { + ReadOptions non_blocking_opts, regular_opts; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + non_blocking_opts.read_tier = kBlockCacheTier; + non_blocking_opts.managed = true; + CreateAndReopenWithCF({"pikachu"}, options); + // write one kv to the database. + ASSERT_OK(Put(1, "a", "b")); + + // scan using non-blocking iterator. We should find it because + // it is in memtable. 
+ Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + + // flush memtable to storage. Now, the key should not be in the + // memtable neither in the block cache. + ASSERT_OK(Flush(1)); + + // verify that a non-blocking iterator does not find any + // kvs. Neither does it do any IOs to storage. + int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get(1, "a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // This test verifies block cache behaviors, which is not used by plain + // table format. 
+ // Exclude kHashCuckoo as it does not support iteration currently + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | + kSkipMmapReads)); +} +#endif // ROCKSDB_LITE + +TEST_F(DBIteratorTest, IterSeekBeforePrev) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("2", "j")); + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + iter->Prev(); + iter->Seek(Slice("a")); + iter->Prev(); + delete iter; +} + +namespace { +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} +} // namespace + +TEST_F(DBIteratorTest, IterLongKeys) { + ASSERT_OK(Put(MakeLongKey(20, 0), "0")); + ASSERT_OK(Put(MakeLongKey(32, 2), "2")); + ASSERT_OK(Put("a", "b")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put(MakeLongKey(50, 1), "1")); + ASSERT_OK(Put(MakeLongKey(127, 3), "3")); + ASSERT_OK(Put(MakeLongKey(64, 4), "4")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + iter->Seek(MakeLongKey(20, 0)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); + delete iter; + + iter = db_->NewIterator(ReadOptions()); + iter->Seek(MakeLongKey(50, 1)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + delete iter; +} + +TEST_F(DBIteratorTest, IterNextWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", 
"b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("a")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + delete iter; +} + +TEST_F(DBIteratorTest, IterPrevWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; +} + +TEST_F(DBIteratorTest, IterPrevWithNewerSeq2) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + ASSERT_EQ(IterStatus(iter), "c->d"); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; +} + +TEST_F(DBIteratorTest, IterEmpty) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + 
ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, IterSingle) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, IterMulti) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", "vb")); + ASSERT_OK(Put(1, "c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), 
"(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put(1, "a", "va2")); + ASSERT_OK(Put(1, "a2", "va3")); + ASSERT_OK(Put(1, "b", "vb2")); + ASSERT_OK(Put(1, "c", "vc2")); + ASSERT_OK(Delete(1, "b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +// Check that we can skip over a run of user keys +// by using reseek rather than sequential scan +TEST_F(DBIteratorTest, IterReseek) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); + options.max_sequential_skip_in_iterations = 3; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // insert three keys with same 
userkey and verify that + // reseek is not invoked. For each of these test cases, + // verify that we can find the next key "b". + ASSERT_OK(Put(1, "a", "zero")); + ASSERT_OK(Put(1, "a", "one")); + ASSERT_OK(Put(1, "a", "two")); + ASSERT_OK(Put(1, "b", "bone")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->two"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of three keys with same userkey and verify + // that reseek is still not invoked. + ASSERT_OK(Put(1, "a", "three")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->three"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of four keys with same userkey and verify + // that reseek is invoked. + ASSERT_OK(Put(1, "a", "four")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->four"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // Testing reverse iterator + // At this point, we have three versions of "a" and one version of "b". + // The reseek statistics is already at 1. 
+ int num_reseeks = static_cast( + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + + // Insert another version of b and assert that reseek is not invoked + ASSERT_OK(Put(1, "b", "btwo")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->btwo"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; + + // insert two more versions of b. This makes a total of 4 versions + // of b and 4 versions of a. + ASSERT_OK(Put(1, "b", "bthree")); + ASSERT_OK(Put(1, "b", "bfour")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->bfour"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); + iter->Prev(); + + // the previous Prev call should have invoked reseek + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; +} + +TEST_F(DBIteratorTest, IterSmallAndLargeMix) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); + ASSERT_OK(Put(1, "c", "vc")); + ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); + ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), 
"(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, IterMultiWithDelete) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "ka", "va")); + ASSERT_OK(Put(1, "kb", "vb")); + ASSERT_OK(Put(1, "kc", "vc")); + ASSERT_OK(Delete(1, "kb")); + ASSERT_EQ("NOT_FOUND", Get(1, "kb")); + + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->Seek("kc"); + ASSERT_EQ(IterStatus(iter), "kc->vc"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "ka->va"); + } + } + delete iter; + } while (ChangeOptions()); +} + +TEST_F(DBIteratorTest, IterPrevMaxSkip) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + for (int i = 0; i < 2; i++) { + ASSERT_OK(Put(1, "key1", "v1")); + ASSERT_OK(Put(1, "key2", "v2")); + ASSERT_OK(Put(1, "key3", "v3")); + ASSERT_OK(Put(1, "key4", "v4")); + ASSERT_OK(Put(1, "key5", "v5")); + } + + VerifyIterLast("key5->v5", 1); + + ASSERT_OK(Delete(1, "key5")); + VerifyIterLast("key4->v4", 1); + + ASSERT_OK(Delete(1, "key4")); + VerifyIterLast("key3->v3", 1); + + ASSERT_OK(Delete(1, "key3")); + VerifyIterLast("key2->v2", 1); + + ASSERT_OK(Delete(1, "key2")); + VerifyIterLast("key1->v1", 1); + + ASSERT_OK(Delete(1, "key1")); + VerifyIterLast("(invalid)", 1); + } while 
(ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); +} + +TEST_F(DBIteratorTest, IterWithSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + ASSERT_OK(Put(1, "key1", "val1")); + ASSERT_OK(Put(1, "key2", "val2")); + ASSERT_OK(Put(1, "key3", "val3")); + ASSERT_OK(Put(1, "key4", "val4")); + ASSERT_OK(Put(1, "key5", "val5")); + + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = db_->NewIterator(options, handles_[1]); + + // Put more values after the snapshot + ASSERT_OK(Put(1, "key100", "val100")); + ASSERT_OK(Put(1, "key101", "val101")); + + iter->Seek("key5"); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + } + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + db_->ReleaseSnapshot(snapshot); + delete iter; + // skip as HashCuckooRep does not support snapshot + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST_F(DBIteratorTest, IteratorPinsRef) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Put(1, "foo", "hello"); + + // Get iterator that will yield the current contents of the DB. 
+ Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + // Write to force compactions + Put(1, "foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + // 100K values + ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); + } + Put(1, "foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, DBIteratorBoundTest) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + + options.prefix_extractor = nullptr; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + ReadOptions ro; + // iterate_upper_bound points beyond the last expected entry + Slice prefix("foo2"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("foo1")), 0); + + iter->Next(); + // should stop here... 
+ ASSERT_TRUE(!iter->Valid()); + } + // Testing SeekToLast with iterate_upper_bound set + { + ReadOptions ro; + + Slice prefix("foo"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("a")), 0); + } + + // prefix is the first letter of the key + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing with iterate_upper_bound and prefix_extractor + // Seek target and iterate_upper_bound are not is same prefix + // This should be an error + { + ReadOptions ro; + Slice upper_bound("g"); + ro.iterate_upper_bound = &upper_bound; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo1", iter->key().ToString()); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + + // testing that iterate_upper_bound prevents iterating over deleted items + // if the bound has already reached + { + options.prefix_extractor = nullptr; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("b", "0")); + ASSERT_OK(Put("b1", "0")); + ASSERT_OK(Put("c", "0")); + ASSERT_OK(Put("d", "0")); + ASSERT_OK(Put("e", "0")); + ASSERT_OK(Delete("c")); + ASSERT_OK(Delete("d")); + + // base case with no bound + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + perf_context.Reset(); + iter->Next(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 
2); + + // now testing with iterate_bound + Slice prefix("c"); + ro.iterate_upper_bound = &prefix; + + iter.reset(db_->NewIterator(ro)); + + perf_context.Reset(); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + iter->Next(); + // the iteration should stop as soon as the the bound key is reached + // even though the key is deleted + // hence internal_delete_skipped_count should be 0 + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); + } +} + +// TODO(3.13): fix the issue of Seek() + Prev() which might not necessary +// return the biggest key which is smaller than the seek key. +TEST_F(DBIteratorTest, PrevAfterMerge) { + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreatePutOperator(); + DestroyAndReopen(options); + + // write three entries with different keys using Merge() + WriteOptions wopts; + db_->Merge(wopts, "1", "data1"); + db_->Merge(wopts, "2", "data2"); + db_->Merge(wopts, "3", "data3"); + + std::unique_ptr it(db_->NewIterator(ReadOptions())); + + it->Seek("2"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("2", it->key().ToString()); + + it->Prev(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("1", it->key().ToString()); +} + +TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { + enum TestConfig { + NORMAL, + CLOSE_AND_OPEN, + COMPACT_BEFORE_READ, + FLUSH_EVERY_1000, + MAX + }; + + // Generate Random data + Random rnd(301); + + int puts = 100000; + int key_pool = static_cast(puts * 0.7); + int key_size = 100; + int val_size = 1000; + int seeks_percentage = 20; // 20% of keys will be used to test seek() + int delete_percentage = 20; // 20% of keys will be deleted + int merge_percentage = 20; // 20% of keys will be added using Merge() + + for (int run_config = 0; run_config < TestConfig::MAX; run_config++) { + 
Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreatePutOperator(); + DestroyAndReopen(options); + + std::vector generated_keys(key_pool); + for (int i = 0; i < key_pool; i++) { + generated_keys[i] = RandomString(&rnd, key_size); + } + + std::map true_data; + std::vector random_keys; + std::vector deleted_keys; + for (int i = 0; i < puts; i++) { + auto& k = generated_keys[rnd.Next() % key_pool]; + auto v = RandomString(&rnd, val_size); + + // Insert data to true_data map and to DB + true_data[k] = v; + if (rnd.OneIn(static_cast(100.0 / merge_percentage))) { + ASSERT_OK(db_->Merge(WriteOptions(), k, v)); + } else { + ASSERT_OK(Put(k, v)); + } + + // Pick random keys to be used to test Seek() + if (rnd.OneIn(static_cast(100.0 / seeks_percentage))) { + random_keys.push_back(k); + } + + // Delete some random keys + if (rnd.OneIn(static_cast(100.0 / delete_percentage))) { + deleted_keys.push_back(k); + true_data.erase(k); + ASSERT_OK(Delete(k)); + } + + if (run_config == TestConfig::FLUSH_EVERY_1000) { + if (i && i % 1000 == 0) { + Flush(); + } + } + } + + if (run_config == TestConfig::CLOSE_AND_OPEN) { + Close(); + Reopen(options); + } else if (run_config == TestConfig::COMPACT_BEFORE_READ) { + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + { + // Test Seek to random keys + printf("Testing seek on %zu keys\n", random_keys.size()); + std::vector keys_slices; + std::vector true_keys; + for (auto& k : random_keys) { + iter->Seek(k); + if (!iter->Valid()) { + ASSERT_EQ(true_data.lower_bound(k), true_data.end()); + continue; + } + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + 
keys_slices.push_back(iter->key()); + true_keys.push_back(true_data.lower_bound(k)->first); + } + + for (size_t i = 0; i < keys_slices.size(); i++) { + ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]); + } + } + + { + // Test iterating all data forward + printf("Testing iterating forward on all keys\n"); + std::vector all_keys; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + all_keys.push_back(iter->key()); + } + ASSERT_EQ(all_keys.size(), true_data.size()); + + // Verify that all keys slices are valid + auto data_iter = true_data.begin(); + for (size_t i = 0; i < all_keys.size(); i++) { + ASSERT_EQ(all_keys[i].ToString(), data_iter->first); + data_iter++; + } + } + + { + // Test iterating all data backward + printf("Testing iterating backward on all keys\n"); + std::vector all_keys; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + all_keys.push_back(iter->key()); + } + ASSERT_EQ(all_keys.size(), true_data.size()); + + // Verify that all keys slices are valid (backward) + auto data_iter = true_data.rbegin(); + for (size_t i = 0; i < all_keys.size(); i++) { + ASSERT_EQ(all_keys[i].ToString(), data_iter->first); + data_iter++; + } + } + + delete iter; + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBIteratorTest, PinnedDataIteratorMultipleFiles) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.disable_auto_compactions = true; + options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb + DestroyAndReopen(options); + + std::map true_data; + + // Generate 4 sst files in L2 + Random rnd(301); + for (int i = 1; i <= 1000; i++) { + 
std::string k = Key(i * 3); + std::string v = RandomString(&rnd, 100); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + if (i % 250 == 0) { + ASSERT_OK(Flush()); + } + } + ASSERT_EQ(FilesPerLevel(0), "4"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(FilesPerLevel(0), "0,4"); + + // Generate 4 sst files in L0 + for (int i = 1; i <= 1000; i++) { + std::string k = Key(i * 2); + std::string v = RandomString(&rnd, 100); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + if (i % 250 == 0) { + ASSERT_OK(Flush()); + } + } + ASSERT_EQ(FilesPerLevel(0), "4,4"); + + // Add some keys/values in memtables + for (int i = 1; i <= 1000; i++) { + std::string k = Key(i); + std::string v = RandomString(&rnd, 100); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + } + ASSERT_EQ(FilesPerLevel(0), "4,4"); + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + std::vector> results; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + results.emplace_back(iter->key(), iter->value().ToString()); + } + + ASSERT_EQ(results.size(), true_data.size()); + auto data_iter = true_data.begin(); + for (size_t i = 0; i < results.size(); i++, data_iter++) { + auto& kv = results[i]; + ASSERT_EQ(kv.first, data_iter->first); + ASSERT_EQ(kv.second, data_iter->second); + } + + delete iter; +} +#endif + +TEST_F(DBIteratorTest, PinnedDataIteratorMergeOperator) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + DestroyAndReopen(options); + + std::string numbers[7]; + for (int val = 0; val <= 6; val++) { + PutFixed64(numbers + val, val); + } + + // +1 all keys in range [ 0 => 999] + for (int i = 0; i < 
1000; i++) { + WriteOptions wo; + ASSERT_OK(db_->Merge(wo, Key(i), numbers[1])); + } + + // +2 all keys divisible by 2 in range [ 0 => 999] + for (int i = 0; i < 1000; i += 2) { + WriteOptions wo; + ASSERT_OK(db_->Merge(wo, Key(i), numbers[2])); + } + + // +3 all keys divisible by 5 in range [ 0 => 999] + for (int i = 0; i < 1000; i += 5) { + WriteOptions wo; + ASSERT_OK(db_->Merge(wo, Key(i), numbers[3])); + } + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + std::vector> results; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + results.emplace_back(iter->key(), iter->value().ToString()); + } + + ASSERT_EQ(results.size(), 1000); + for (size_t i = 0; i < results.size(); i++) { + auto& kv = results[i]; + ASSERT_EQ(kv.first, Key(static_cast(i))); + int expected_val = 1; + if (i % 2 == 0) { + expected_val += 2; + } + if (i % 5 == 0) { + expected_val += 3; + } + ASSERT_EQ(kv.second, numbers[expected_val]); + } + + delete iter; +} + +TEST_F(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.write_buffer_size = 100000; + DestroyAndReopen(options); + + Random rnd(301); + + std::map true_data; + for (int i = 0; i < 1000; i++) { + std::string k = RandomString(&rnd, 10); + std::string v = RandomString(&rnd, 1000); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + } + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + // Delete 50% of the keys and update the other 50% + for (auto& kv : true_data) { + if (rnd.OneIn(2)) { + ASSERT_OK(Delete(kv.first)); + } else { + std::string new_val = RandomString(&rnd, 1000); + ASSERT_OK(Put(kv.first, new_val)); + } + } + + std::vector> 
results; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + results.emplace_back(iter->key(), iter->value().ToString()); + } + + auto data_iter = true_data.begin(); + for (size_t i = 0; i < results.size(); i++, data_iter++) { + auto& kv = results[i]; + ASSERT_EQ(kv.first, data_iter->first); + ASSERT_EQ(kv.second, data_iter->second); + } + + delete iter; +} + +TEST_F(DBIteratorTest, IteratorWithLocalStatistics) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 1000; i++) { + // Key 10 bytes / Value 10 bytes + ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + } + + std::atomic total_next(0); + std::atomic total_next_found(0); + std::atomic total_prev(0); + std::atomic total_prev_found(0); + std::atomic total_bytes(0); + + std::vector threads; + std::function reader_func_next = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + // Seek will bump ITER_BYTES_READ + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + while (true) { + iter->Next(); + total_next++; + + if (!iter->Valid()) { + break; + } + total_next_found++; + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + } + + delete iter; + }; + + std::function reader_func_prev = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToLast(); + // Seek will bump ITER_BYTES_READ + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + while (true) { + iter->Prev(); + total_prev++; + + if (!iter->Valid()) { + break; + } + total_prev_found++; + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + } + + delete iter; + }; + + for (int i = 0; i < 10; i++) { + threads.emplace_back(reader_func_next); + } + 
for (int i = 0; i < 15; i++) { + threads.emplace_back(reader_func_prev); + } + + for (auto& t : threads) { + t.join(); + } + + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), total_next); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND), + total_next_found); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), total_prev); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND), + total_prev_found); + ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ), total_bytes); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc new file mode 100644 index 000000000..1ec125cb1 --- /dev/null +++ b/db/db_sst_test.cc @@ -0,0 +1,1300 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksdb/sst_file_writer.h" +#include "util/sst_file_manager_impl.h" + +namespace rocksdb { + +class DBSSTTest : public DBTestBase { + public: + DBSSTTest() : DBTestBase("/db_sst_test") {} +}; + +TEST_F(DBSSTTest, DontDeletePendingOutputs) { + Options options; + options.env = env_; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Every time we write to a table file, call FOF/POF with full DB scan. 
This + // will make sure our pending_outputs_ protection work correctly + std::function purge_obsolete_files_function = [&]() { + JobContext job_context(0); + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&job_context, true /*force*/); + dbfull()->TEST_UnlockMutex(); + dbfull()->PurgeObsoleteFiles(job_context); + job_context.Clean(); + }; + + env_->table_write_callback_ = &purge_obsolete_files_function; + + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put("a", "begin")); + ASSERT_OK(Put("z", "end")); + ASSERT_OK(Flush()); + } + + // If pending output guard does not work correctly, PurgeObsoleteFiles() will + // delete the file that Compaction is trying to create, causing this: error + // db/db_test.cc:975: IO error: + // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory + Compact("a", "b"); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBSSTTest, DontDeleteMovedFile) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // If the moved file is actually deleted (the move-safeguard in + // ~Version::Version() is not there), we get this failure: + // Corruption: Can't access /000009.sst + Reopen(options); +} + +// This reproduces a bug where we don't delete a file because when it was +// supposed to be deleted, it was blocked by 
pending_outputs +// Consider: +// 1. current file_number is 13 +// 2. compaction (1) starts, blocks deletion of all files starting with 13 +// (pending outputs) +// 3. file 13 is created by compaction (2) +// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file +// 13 has no references, it is put into VersionSet::obsolete_files_ +// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13 +// is deleted from obsolete_files_ set. +// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by +// pending outputs since compaction (1) is still running. It is not deleted and +// it is not present in obsolete_files_ anymore. Therefore, we never delete it. +TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 2 * 1024 * 1024; // 2 MB + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + options.max_background_flushes = 2; + options.max_background_compactions = 2; + + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + + Reopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + test::SleepingBackgroundTask blocking_thread; + port::Mutex mutex_; + bool already_blocked(false); + + // block the flush + std::function block_first_time = [&]() { + bool blocking = false; + { + MutexLock l(&mutex_); + if (!already_blocked) { + blocking = true; + already_blocked = true; + } + } + if (blocking) { + blocking_thread.DoSleep(); + } + }; + 
env_->table_write_callback_ = &block_first_time; + // Create 1MB sst file + for (int j = 0; j < 256; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); + } + // this should trigger a flush, which is blocked with block_first_time + // pending_file is protecting all the files created after + + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto file_on_L2 = metadata[0].name; + listener->SetExpectedFileName(dbname_ + file_on_L2); + + ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, + true /* disallow trivial move */)); + ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); + + // finish the flush! + blocking_thread.WakeUp(); + blocking_thread.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0)); + + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 2U); + + // This file should have been deleted during last compaction + ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2)); + listener->VerifyMatchedCount(1); +} + +#endif // ROCKSDB_LITE + +TEST_F(DBSSTTest, DBWithSstFileManager) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { files_added++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { files_deleted++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* arg) { files_moved++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + DestroyAndReopen(options); + 
+ Random rnd(301); + for (int i = 0; i < 25; i++) { + GenerateNewRandomFile(&rnd); + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + // Verify that we are tracking all sst files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles()); + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + auto files_in_db = GetAllSSTFiles(); + // Verify that we are tracking all sst files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + // We flushed at least 25 files + ASSERT_GE(files_added, 25); + // Compaction must have deleted some files + ASSERT_GT(files_deleted, 0); + // No files were moved + ASSERT_EQ(files_moved, 0); + + Close(); + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened + Close(); + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBSSTTest, RateLimitedDelete) { + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DBSSTTest::RateLimitedDelete:1", + "DeleteScheduler::BackgroundEmptyTrash"}, + }); + + std::vector penalties; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::BackgroundEmptyTrash:Wait", + [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + Options options = CurrentOptions(); + 
options.disable_auto_compactions = true; + options.env = env_; + + std::string trash_dir = test::TmpDir(env_) + "/trash"; + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); + ASSERT_OK(s); + auto sfm = static_cast(options.sst_file_manager.get()); + + Destroy(last_options_); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(TryReopen(options)); + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + + // Compaction will move the 4 files in L0 to trash and create 1 L1 file + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + uint64_t delete_start_time = env_->NowMicros(); + // Hold BackgroundEmptyTrash + TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1"); + sfm->WaitForEmptyTrash(); + uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time; + + uint64_t total_files_size = 0; + uint64_t expected_penlty = 0; + ASSERT_EQ(penalties.size(), metadata.size()); + for (size_t i = 0; i < metadata.size(); i++) { + total_files_size += metadata[i].size; + expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec); + ASSERT_EQ(expected_penlty, penalties[i]); + } + ASSERT_GT(time_spent_deleting, expected_penlty * 0.9); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Create a DB with 2 db_paths, and generate multiple files in the 2 +// db_paths using CompactRangeOptions, make sure that files that were +// deleted from first 
db_path were deleted using DeleteScheduler and +// files in the second path were not. +TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) { + int bg_delete_file = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void* arg) { bg_delete_file++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.db_paths.emplace_back(dbname_, 1024 * 100); + options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100); + options.env = env_; + + std::string trash_dir = test::TmpDir(env_) + "/trash"; + int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec + Status s; + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); + ASSERT_OK(s); + auto sfm = static_cast(options.sst_file_manager.get()); + + DestroyAndReopen(options); + + // Create 4 files in L0 + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + // Compaction will delete files from L0 in first db path and generate a new + // file in L1 in second db path + CompactRangeOptions compact_options; + compact_options.target_path_id = 1; + Slice begin("Key0"); + Slice end("Key3"); + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + // Create 4 files in L0 + for (int i = 4; i < 8; i++) { + ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'))); + ASSERT_OK(Flush()); + } + ASSERT_EQ("4,1", FilesPerLevel(0)); + + // Compaction will delete files from L0 in first db path and generate a new + // file in L1 in second db path + begin = "Key4"; + end = "Key7"; + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + ASSERT_EQ("0,2", FilesPerLevel(0)); + + sfm->WaitForEmptyTrash(); + ASSERT_EQ(bg_delete_file, 8); + + 
compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + sfm->WaitForEmptyTrash(); + ASSERT_EQ(bg_delete_file, 8); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) { + int bg_delete_file = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void* arg) { bg_delete_file++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + + // Create 4 files in L0 + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + // Close DB and destroy it using DeleteScheduler + Close(); + std::string trash_dir = test::TmpDir(env_) + "/trash"; + int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec + Status s; + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); + ASSERT_OK(s); + ASSERT_OK(DestroyDB(dbname_, options)); + + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->WaitForEmptyTrash(); + // We have deleted the 4 sst files in the delete_scheduler + ASSERT_EQ(bg_delete_file, 4); +} +#endif // ROCKSDB_LITE + +TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing 100 keys. 
+ for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + + uint64_t first_file_size = 0; + auto files_in_db = GetAllSSTFiles(&first_file_size); + ASSERT_EQ(sfm->GetTotalSize(), first_file_size); + + // Set the maximum allowed space usage to the current total size + sfm->SetMaxAllowedSpaceUsage(first_file_size + 1); + + ASSERT_OK(Put("key1", "val1")); + // This flush will cause bg_error_ and will fail + ASSERT_NOK(Flush()); +} + +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) { + // This test will set a maximum allowed space for the DB, then it will + // keep filling the DB until the limit is reached and bg_error_ is set. + // When bg_error_ is set we will verify that the DB size is greater + // than the limit. + + std::vector max_space_limits_mbs = {1, 2, 4, 8, 10}; + + bool bg_error_set = false; + uint64_t total_sst_files_size = 0; + + int reached_max_space_on_flush = 0; + int reached_max_space_on_compaction = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", + [&](void* arg) { + bg_error_set = true; + GetAllSSTFiles(&total_sst_files_size); + reached_max_space_on_flush++; + }); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached", + [&](void* arg) { + bg_error_set = true; + GetAllSSTFiles(&total_sst_files_size); + reached_max_space_on_compaction++; + }); + + for (auto limit_mb : max_space_limits_mbs) { + bg_error_set = false; + total_sst_files_size = 0; + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.write_buffer_size = 1024 * 512; // 512 Kb + DestroyAndReopen(options); + Random rnd(301); + + 
sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024); + + int keys_written = 0; + uint64_t estimated_db_size = 0; + while (true) { + auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); + if (!s.ok()) { + break; + } + keys_written++; + // Check the estimated db size vs the db limit just to make sure we + // dont run into an infinite loop + estimated_db_size = keys_written * 60; // ~60 bytes per key + ASSERT_LT(estimated_db_size, limit_mb * 1024 * 1024 * 2); + } + ASSERT_TRUE(bg_error_set); + ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + + ASSERT_GT(reached_max_space_on_flush, 0); + ASSERT_GT(reached_max_space_on_compaction, 0); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) { + // Open DB with infinite max open files + // - First iteration use 1 thread to open files + // - Second iteration use 5 threads to open files + for (int iter = 0; iter < 2; iter++) { + Options options; + options.create_if_missing = true; + options.write_buffer_size = 100000; + options.disable_auto_compactions = true; + options.max_open_files = -1; + if (iter == 0) { + options.max_file_opening_threads = 1; + } else { + options.max_file_opening_threads = 5; + } + options = CurrentOptions(options); + DestroyAndReopen(options); + + // Create 12 Files in L0 (then move then to L2) + for (int i = 0; i < 12; i++) { + std::string k = "L2_" + Key(i); + ASSERT_OK(Put(k, k + std::string(1000, 'a'))); + ASSERT_OK(Flush()); + } + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + db_->CompactRange(compact_options, nullptr, nullptr); + + // Create 12 Files in L0 + for (int i = 0; i < 12; i++) { + std::string k = "L0_" + Key(i); + ASSERT_OK(Put(k, k + std::string(1000, 'a'))); + ASSERT_OK(Flush()); + } + Close(); + + // Reopening the DB will load all exisitng files + Reopen(options); + ASSERT_EQ("12,0,12", FilesPerLevel(0)); + 
std::vector> files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + + for (const auto& level : files) { + for (const auto& file : level) { + ASSERT_TRUE(file.table_reader_handle != nullptr); + } + } + + for (int i = 0; i < 12; i++) { + ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a')); + ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a')); + } + } +} + +TEST_F(DBSSTTest, GetTotalSstFilesSize) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + DestroyAndReopen(options); + // Generate 5 files in L0 + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 10; j++) { + std::string val = "val_file_" + ToString(i); + ASSERT_OK(Put(Key(j), val)); + } + Flush(); + } + ASSERT_EQ("5", FilesPerLevel(0)); + + std::vector live_files_meta; + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 5); + uint64_t single_file_size = live_files_meta[0].size; + + uint64_t live_sst_files_size = 0; + uint64_t total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 5 + // Total SST files = 5 + ASSERT_EQ(live_sst_files_size, 5 * single_file_size); + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + // hold current version + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + + // Compact 5 files into 1 file in L0 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 1); + + live_sst_files_size = 0; + total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + 
ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 1 (compacted file) + // Total SST files = 6 (5 original files + compacted file) + ASSERT_EQ(live_sst_files_size, 1 * single_file_size); + ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + + // hold current version + std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + + // Delete all keys and compact, this will delete all live files + for (int i = 0; i < 10; i++) { + ASSERT_OK(Delete(Key(i))); + } + Flush(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 0); + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 6 (5 original files + compacted file) + ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + + iter1.reset(); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 1 (compacted file) + ASSERT_EQ(total_sst_files_size, 1 * single_file_size); + + iter2.reset(); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 0 + ASSERT_EQ(total_sst_files_size, 0); +} + +TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + DestroyAndReopen(options); + // Generate 5 files in L0 + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(Key(i), "val")); + Flush(); + } + ASSERT_EQ("5", FilesPerLevel(0)); + + std::vector live_files_meta; + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 5); + uint64_t single_file_size = 
live_files_meta[0].size; + + uint64_t live_sst_files_size = 0; + uint64_t total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + + // Live SST files = 5 + // Total SST files = 5 + ASSERT_EQ(live_sst_files_size, 5 * single_file_size); + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + // hold current version + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + + // Compaction will do trivial move from L0 to L1 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,5", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 5); + + live_sst_files_size = 0; + total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 5 + // Total SST files = 5 (used in 2 version) + ASSERT_EQ(live_sst_files_size, 5 * single_file_size); + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + // hold current version + std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + + // Delete all keys and compact, this will delete all live files + for (int i = 0; i < 5; i++) { + ASSERT_OK(Delete(Key(i))); + } + Flush(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 0); + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 5 (used in 2 version) + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + iter1.reset(); + 
iter2.reset(); + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 0 + ASSERT_EQ(total_sst_files_size, 0); +} + +TEST_F(DBSSTTest, AddExternalSstFile) { + do { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + const ImmutableCFOptions ioptions(options); + + SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + + // file1.sst (0 => 99) + std::string file1 = sst_files_folder + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + // sst_file_writer already finished, cannot add this value + s = sst_file_writer.Add(Key(100), "bad_val"); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // file2.sst (100 => 199) + std::string file2 = sst_files_folder + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 100; k < 200; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + // Cannot add this key because it's not after last added key + s = sst_file_writer.Add(Key(99), "bad_val"); + ASSERT_FALSE(s.ok()) << s.ToString(); + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 100); + ASSERT_EQ(file2_info.smallest_key, Key(100)); + ASSERT_EQ(file2_info.largest_key, Key(199)); + + // file3.sst (195 => 299) + // This file values overlap with file2 values + std::string file3 = 
sst_files_folder + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 195; k < 300; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 105); + ASSERT_EQ(file3_info.smallest_key, Key(195)); + ASSERT_EQ(file3_info.largest_key, Key(299)); + + // file4.sst (30 => 39) + // This file values overlap with file1 values + std::string file4 = sst_files_folder + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + for (int k = 30; k < 40; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file4_info; + s = sst_file_writer.Finish(&file4_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file4_info.file_path, file4); + ASSERT_EQ(file4_info.num_entries, 10); + ASSERT_EQ(file4_info.smallest_key, Key(30)); + ASSERT_EQ(file4_info.largest_key, Key(39)); + + // file5.sst (400 => 499) + std::string file5 = sst_files_folder + "file5.sst"; + ASSERT_OK(sst_file_writer.Open(file5)); + for (int k = 400; k < 500; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file5_info; + s = sst_file_writer.Finish(&file5_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file5_info.file_path, file5); + ASSERT_EQ(file5_info.num_entries, 100); + ASSERT_EQ(file5_info.smallest_key, Key(400)); + ASSERT_EQ(file5_info.largest_key, Key(499)); + + // Cannot create an empty sst file + std::string file_empty = sst_files_folder + "file_empty.sst"; + ExternalSstFileInfo file_empty_info; + s = sst_file_writer.Finish(&file_empty_info); + ASSERT_NOK(s); + + DestroyAndReopen(options); + // Add file using file path + s = db_->AddFile(file1); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 100; k++) { + 
ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + // Add file while holding a snapshot will fail + const Snapshot* s1 = db_->GetSnapshot(); + if (s1 != nullptr) { + ASSERT_NOK(db_->AddFile(&file2_info)); + db_->ReleaseSnapshot(s1); + } + // We can add the file after releaseing the snapshot + ASSERT_OK(db_->AddFile(&file2_info)); + + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 200; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + // This file have overlapping values with the exisitng data + s = db_->AddFile(file3); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // This file have overlapping values with the exisitng data + s = db_->AddFile(&file4_info); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // Overwrite values of keys divisible by 5 + for (int k = 0; k < 200; k += 5) { + ASSERT_OK(Put(Key(k), Key(k) + "_val_new")); + } + ASSERT_NE(db_->GetLatestSequenceNumber(), 0U); + + // Key range of file5 (400 => 499) dont overlap with any keys in DB + ASSERT_OK(db_->AddFile(file5)); + + // Make sure values are correct before and after flush/compaction + for (int i = 0; i < 2; i++) { + for (int k = 0; k < 200; k++) { + std::string value = Key(k) + "_val"; + if (k % 5 == 0) { + value += "_new"; + } + ASSERT_EQ(Get(Key(k)), value); + } + for (int k = 400; k < 500; k++) { + std::string value = Key(k) + "_val"; + ASSERT_EQ(Get(Key(k)), value); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + + Close(); + options.disable_auto_compactions = true; + Reopen(options); + + // Delete keys in range (400 => 499) + for (int k = 400; k < 500; k++) { + ASSERT_OK(Delete(Key(k))); + } + // We deleted range (400 => 499) but cannot add file5 because + // of the range tombstones + ASSERT_NOK(db_->AddFile(file5)); + + // Compacting the DB will remove the tombstones + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Now we can add the file + ASSERT_OK(db_->AddFile(file5)); + + // 
Verify values of file5 in DB + for (int k = 400; k < 500; k++) { + std::string value = Key(k) + "_val"; + ASSERT_EQ(Get(Key(k)), value); + } + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + kSkipFIFOCompaction)); +} + +// This test reporduce a bug that can happen in some cases if the DB started +// purging obsolete files when we are adding an external sst file. +// This situation may result in deleting the file while it's being added. +TEST_F(DBSSTTest, AddExternalSstFilePurgeObsoleteFilesBug) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + const ImmutableCFOptions ioptions(options); + + SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + + // file1.sst (0 => 500) + std::string sst_file_path = sst_files_folder + "file1.sst"; + Status s = sst_file_writer.Open(sst_file_path); + ASSERT_OK(s); + for (int i = 0; i < 500; i++) { + std::string k = Key(i); + s = sst_file_writer.Add(k, k + "_val"); + ASSERT_OK(s); + } + + ExternalSstFileInfo sst_file_info; + s = sst_file_writer.Finish(&sst_file_info); + ASSERT_OK(s); + + options.delete_obsolete_files_period_micros = 0; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::AddFile:FileCopied", [&](void* arg) { + ASSERT_OK(Put("aaa", "bbb")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("aaa", "xxx")); + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + s = db_->AddFile(sst_file_path); + ASSERT_OK(s); + + for (int i = 0; i < 500; i++) { + std::string k = Key(i); + std::string v = k + "_val"; + ASSERT_EQ(Get(k), v); + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, AddExternalSstFileNoCopy) { + std::string sst_files_folder = test::TmpDir(env_) + 
"/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + const ImmutableCFOptions ioptions(options); + + SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + + // file1.sst (0 => 99) + std::string file1 = sst_files_folder + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + + // file2.sst (100 => 299) + std::string file2 = sst_files_folder + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 100; k < 300; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 200); + ASSERT_EQ(file2_info.smallest_key, Key(100)); + ASSERT_EQ(file2_info.largest_key, Key(299)); + + // file3.sst (110 => 124) .. 
overlap with file2.sst + std::string file3 = sst_files_folder + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 110; k < 125; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 15); + ASSERT_EQ(file3_info.smallest_key, Key(110)); + ASSERT_EQ(file3_info.largest_key, Key(124)); + + s = db_->AddFile(&file1_info, true /* move file */); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); + + s = db_->AddFile(&file2_info, false /* copy file */); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(env_->FileExists(file2)); + + // This file have overlapping values with the exisitng data + s = db_->AddFile(&file3_info, true /* move file */); + ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_OK(env_->FileExists(file3)); + + for (int k = 0; k < 300; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } +} + +TEST_F(DBSSTTest, AddExternalSstFileMultiThreaded) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + // Bulk load 10 files every file contain 1000 keys + int num_files = 10; + int keys_per_file = 1000; + + // Generate file names + std::vector file_names; + for (int i = 0; i < num_files; i++) { + std::string file_name = "file_" + ToString(i) + ".sst"; + file_names.push_back(sst_files_folder + file_name); + } + + do { + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + const ImmutableCFOptions ioptions(options); + + std::atomic thread_num(0); + std::function write_file_func = [&]() { + int file_idx = thread_num.fetch_add(1); + int range_start = file_idx * keys_per_file; + int range_end = range_start + keys_per_file; + + SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + + 
ASSERT_OK(sst_file_writer.Open(file_names[file_idx])); + + for (int k = range_start; k < range_end; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k))); + } + + Status s = sst_file_writer.Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + }; + // Write num_files files in parallel + std::vector sst_writer_threads; + for (int i = 0; i < num_files; ++i) { + sst_writer_threads.emplace_back(write_file_func); + } + + for (auto& t : sst_writer_threads) { + t.join(); + } + + fprintf(stderr, "Wrote %d files (%d keys)\n", num_files, + num_files * keys_per_file); + + thread_num.store(0); + std::atomic files_added(0); + std::function load_file_func = [&]() { + // We intentionally add every file twice, and assert that it was added + // only once and the other add failed + int thread_id = thread_num.fetch_add(1); + int file_idx = thread_id / 2; + // sometimes we use copy, sometimes link .. the result should be the same + bool move_file = (thread_id % 3 == 0); + + Status s = db_->AddFile(file_names[file_idx], move_file); + if (s.ok()) { + files_added++; + } + }; + // Bulk load num_files files in parallel + std::vector add_file_threads; + DestroyAndReopen(options); + for (int i = 0; i < num_files * 2; ++i) { + add_file_threads.emplace_back(load_file_func); + } + + for (auto& t : add_file_threads) { + t.join(); + } + ASSERT_EQ(files_added.load(), num_files); + fprintf(stderr, "Loaded %d files (%d keys)\n", num_files, + num_files * keys_per_file); + + // Overwrite values of keys divisible by 100 + for (int k = 0; k < num_files * keys_per_file; k += 100) { + std::string key = Key(k); + Status s = Put(key, key + "_new"); + ASSERT_TRUE(s.ok()); + } + + for (int i = 0; i < 2; i++) { + // Make sure the values are correct before and after flush/compaction + for (int k = 0; k < num_files * keys_per_file; ++k) { + std::string key = Key(k); + std::string value = (k % 100 == 0) ? 
(key + "_new") : key; + ASSERT_EQ(Get(key), value); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + + fprintf(stderr, "Verified %d values\n", num_files * keys_per_file); + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + kSkipFIFOCompaction)); +} + +TEST_F(DBSSTTest, AddExternalSstFileOverlappingRanges) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + Random rnd(301); + do { + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + DestroyAndReopen(options); + const ImmutableCFOptions ioptions(options); + SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + + printf("Option config = %d\n", option_config_); + std::vector> key_ranges; + for (int i = 0; i < 500; i++) { + int range_start = rnd.Uniform(20000); + int keys_per_range = 10 + rnd.Uniform(41); + + key_ranges.emplace_back(range_start, range_start + keys_per_range); + } + + int memtable_add = 0; + int success_add_file = 0; + int failed_add_file = 0; + std::map true_data; + for (size_t i = 0; i < key_ranges.size(); i++) { + int range_start = key_ranges[i].first; + int range_end = key_ranges[i].second; + + Status s; + std::string range_val = "range_" + ToString(i); + + // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile + if (i && i % 5 == 0) { + // Use DB::Put to insert range (insert into memtable) + range_val += "_put"; + for (int k = range_start; k <= range_end; k++) { + s = Put(Key(k), range_val); + ASSERT_OK(s); + } + memtable_add++; + } else { + // Use DB::AddFile to insert range + range_val += "_add_file"; + + // Generate the file containing the range + std::string file_name = sst_files_folder + env_->GenerateUniqueId(); + ASSERT_OK(sst_file_writer.Open(file_name)); + for (int k = range_start; k <= range_end; k++) { + s = sst_file_writer.Add(Key(k), range_val); + ASSERT_OK(s); + } + ExternalSstFileInfo file_info; + s = 
sst_file_writer.Finish(&file_info); + ASSERT_OK(s); + + // Insert the generated file + s = db_->AddFile(&file_info); + + auto it = true_data.lower_bound(Key(range_start)); + if (it != true_data.end() && it->first <= Key(range_end)) { + // This range overlap with data already exist in DB + ASSERT_NOK(s); + failed_add_file++; + } else { + ASSERT_OK(s); + success_add_file++; + } + } + + if (s.ok()) { + // Update true_data map to include the new inserted data + for (int k = range_start; k <= range_end; k++) { + true_data[Key(k)] = range_val; + } + } + + // Flush / Compact the DB + if (i && i % 50 == 0) { + Flush(); + } + if (i && i % 75 == 0) { + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + } + + printf( + "Total: %zu ranges\n" + "AddFile()|Success: %d ranges\n" + "AddFile()|RangeConflict: %d ranges\n" + "Put(): %d ranges\n", + key_ranges.size(), success_add_file, failed_add_file, memtable_add); + + // Verify the correctness of the data + for (const auto& kv : true_data) { + ASSERT_EQ(Get(kv.first), kv.second); + } + printf("keys/values verified\n"); + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + kSkipFIFOCompaction)); +} + +#endif // ROCKSDB_LITE + +// 1 Create some SST files by inserting K-V pairs into DB +// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file +// 3 Open DB and check if all key can be read +TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) { + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.num_levels = 4; + DestroyAndReopen(options); + + Random rnd(301); + int key_id = 0; + for (int i = 0; i < 10; ++i) { + GenerateNewFile(&rnd, &key_id, false); + } + Flush(); + Close(); + int const num_files = GetSstFileCount(dbname_); + ASSERT_GT(num_files, 0); + + std::vector filenames; + GetSstFiles(dbname_, &filenames); + int num_ldb_files = 0; + for (size_t i = 0; i < filenames.size(); ++i) { + if (i & 1) { + continue; + } + std::string const 
rdb_name = dbname_ + "/" + filenames[i]; + std::string const ldb_name = Rocks2LevelTableFileName(rdb_name); + ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok()); + ++num_ldb_files; + } + ASSERT_GT(num_ldb_files, 0); + ASSERT_EQ(num_files, GetSstFileCount(dbname_)); + + Reopen(options); + for (int k = 0; k < key_id; ++k) { + ASSERT_NE("NOT_FOUND", Get(Key(k))); + } + Destroy(options); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_test.cc b/db/db_test.cc index 70c0646f0..caa230b6a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -39,18 +39,15 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/experimental.h" -#include "rocksdb/sst_file_manager.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/snapshot.h" -#include "rocksdb/sst_file_writer.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/thread_status.h" -#include "rocksdb/wal_filter.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" @@ -65,8 +62,6 @@ #include "util/compression.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" -#include "util/sst_file_manager_impl.h" -#include "util/statistics.h" #include "util/sync_point.h" #include "util/testharness.h" #include "util/testutil.h" @@ -77,30 +72,6 @@ namespace rocksdb { -static uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { - return options.statistics->getTickerCount(ticker_type); -} - -#ifndef ROCKSDB_LITE -// A helper function that ensures the table properties returned in -// `GetPropertiesOfAllTablesTest` is correct. 
-// This test assumes entries size is different for each of the tables. -namespace { - -uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, - std::string column_family_name) { - std::vector metadata; - db->GetLiveFilesMetaData(&metadata); - uint64_t result = 0; - for (auto& fileMetadata : metadata) { - result += (fileMetadata.column_family_name == column_family_name); - } - return result; -} - -} // namespace -#endif // ROCKSDB_LITE - class DBTest : public DBTestBase { public: DBTest() : DBTestBase("/db_test") {} @@ -369,103 +340,6 @@ TEST_F(DBTest, CompactedDB) { ASSERT_TRUE(status_list[5].IsNotFound()); } -// Make sure that when options.block_cache is set, after a new table is -// created its index/filter blocks are added to block cache. -TEST_F(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "key", "val")); - // Create a new table. - ASSERT_OK(Flush(1)); - - // index/filter blocks added to block cache right after table creation. - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(2, /* only index/filter were added */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); - uint64_t int_num; - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); - - // Make sure filter block is in cache. 
- std::string value; - ReadOptions ropt; - db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); - - // Miss count should remain the same. - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - // Make sure index block is in cache. - auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); - value = Get(1, "key"); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(index_block_hit + 1, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - value = Get(1, "key"); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(index_block_hit + 2, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); -} - -TEST_F(DBTest, ParanoidFileChecks) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.level0_file_num_compaction_trigger = 2; - options.paranoid_file_checks = true; - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = false; - table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "1_key", "val")); - ASSERT_OK(Put(1, "9_key", "val")); - // Create a new table. - ASSERT_OK(Flush(1)); - ASSERT_EQ(1, /* read and cache data block */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Put(1, "1_key2", "val2")); - ASSERT_OK(Put(1, "9_key2", "val2")); - // Create a new SST file. This will further trigger a compaction - // and generate another file. 
- ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(3, /* Totally 3 files created up to now */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - // After disabling options.paranoid_file_checks. NO further block - // is added after generating a new file. - ASSERT_OK( - dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}})); - - ASSERT_OK(Put(1, "1_key3", "val3")); - ASSERT_OK(Put(1, "9_key3", "val3")); - ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "1_key4", "val4")); - ASSERT_OK(Put(1, "9_key4", "val4")); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(3, /* Totally 3 files created up to now */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); -} - TEST_F(DBTest, LevelLimitReopen) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); @@ -630,43 +504,6 @@ TEST_F(DBTest, ReadFromPersistedTier) { } while (ChangeOptions(kSkipHashCuckoo)); } -TEST_F(DBTest, IteratorProperty) { - // The test needs to be changed if kPersistedTier is supported in iterator. - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "1", "2"); - ReadOptions ropt; - ropt.pin_data = false; - { - unique_ptr iter(db_->NewIterator(ropt, handles_[1])); - iter->SeekToFirst(); - std::string prop_value; - ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value)); - ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("0", prop_value); - iter->Next(); - ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("Iterator is not valid.", prop_value); - } - Close(); -} - -TEST_F(DBTest, PersistedTierOnIterator) { - // The test needs to be changed if kPersistedTier is supported in iterator. 
- Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - ReadOptions ropt; - ropt.read_tier = kPersistedTier; - - auto* iter = db_->NewIterator(ropt, handles_[1]); - ASSERT_TRUE(iter->status().IsNotSupported()); - delete iter; - - std::vector iters; - ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported()); - Close(); -} - TEST_F(DBTest, SingleDeleteFlush) { // Test to check whether flushing preserves a single delete hidden // behind a put. @@ -964,9148 +801,5092 @@ TEST_F(DBTest, GetEncountersEmptyLevel) { } #endif // ROCKSDB_LITE -// KeyMayExist can lead to a few false positives, but not false negatives. -// To make test deterministic, use a much larger number of bits per key-20 than -// bits in the key, so that false positives are eliminated -TEST_F(DBTest, KeyMayExist) { - do { - ReadOptions ropts; - std::string value; - anon::OptionsOverride options_override; - options_override.filter_policy.reset(NewBloomFilterPolicy(20)); - Options options = CurrentOptions(options_override); - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); - - ASSERT_OK(Put(1, "a", "b")); - bool value_found = false; - ASSERT_TRUE( - db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); - ASSERT_TRUE(value_found); - ASSERT_EQ("b", value); - - ASSERT_OK(Flush(1)); - value.clear(); - - uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); - uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE( - db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); - ASSERT_TRUE(!value_found); - // assert that no new files were opened and no new blocks were - // read into block cache. 
- ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Delete(1, "a")); - - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); - - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Delete(1, "c")); - - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - // KeyMayExist function only checks data in block caches, which is not used - // by plain table format. - } while ( - ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); -} - -TEST_F(DBTest, NonBlockingIteration) { +TEST_F(DBTest, CheckLock) { do { - ReadOptions non_blocking_opts, regular_opts; + DB* localdb; Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - non_blocking_opts.read_tier = kBlockCacheTier; - CreateAndReopenWithCF({"pikachu"}, options); - // write one kv to the database. - ASSERT_OK(Put(1, "a", "b")); - - // scan using non-blocking iterator. We should find it because - // it is in memtable. 
- Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - delete iter; - - // flush memtable to storage. Now, the key should not be in the - // memtable neither in the block cache. - ASSERT_OK(Flush(1)); - - // verify that a non-blocking iterator does not find any - // kvs. Neither does it do any IOs to storage. - uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); - uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - count++; - } - ASSERT_EQ(count, 0); - ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // read in the specified block via a regular get - ASSERT_EQ(Get(1, "a"), "b"); - - // verify that we can find it via a non-blocking scan - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; + ASSERT_OK(TryReopen(options)); - // This test verifies block cache behaviors, which is not used by plain - // table format. 
- // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | - kSkipMmapReads)); + // second open should fail + ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); + } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_LITE -TEST_F(DBTest, ManagedNonBlockingIteration) { +TEST_F(DBTest, FlushMultipleMemtable) { do { - ReadOptions non_blocking_opts, regular_opts; Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - non_blocking_opts.read_tier = kBlockCacheTier; - non_blocking_opts.managed = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_number_to_maintain = -1; CreateAndReopenWithCF({"pikachu"}, options); - // write one kv to the database. - ASSERT_OK(Put(1, "a", "b")); - - // scan using non-blocking iterator. We should find it because - // it is in memtable. - Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - delete iter; - - // flush memtable to storage. Now, the key should not be in the - // memtable neither in the block cache. + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - // verify that a non-blocking iterator does not find any - // kvs. Neither does it do any IOs to storage. 
- int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); - int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - count++; - } - ASSERT_EQ(count, 0); - ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // read in the specified block via a regular get - ASSERT_EQ(Get(1, "a"), "b"); - - // verify that we can find it via a non-blocking scan - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // This test verifies block cache behaviors, which is not used by plain - // table format. 
- // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | - kSkipMmapReads)); -} -#endif // ROCKSDB_LITE - -// A delete is skipped for key if KeyMayExist(key) returns False -// Tests Writebatch consistency and proper delete behaviour -TEST_F(DBTest, FilterDeletes) { - do { - anon::OptionsOverride options_override; - options_override.filter_policy.reset(NewBloomFilterPolicy(20)); - Options options = CurrentOptions(options_override); - options.filter_deletes = true; - CreateAndReopenWithCF({"pikachu"}, options); - WriteBatch batch; - - batch.Delete(handles_[1], "a"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(AllEntriesFor("a", 1), "[ ]"); // Delete skipped - batch.Clear(); - - batch.Put(handles_[1], "a", "b"); - batch.Delete(handles_[1], "a"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(Get(1, "a"), "NOT_FOUND"); - ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]"); // Delete issued - batch.Clear(); - - batch.Delete(handles_[1], "c"); - batch.Put(handles_[1], "c", "d"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(Get(1, "c"), "d"); - ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]"); // Delete skipped - batch.Clear(); - - ASSERT_OK(Flush(1)); // A stray Flush - - batch.Delete(handles_[1], "c"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]"); // Delete issued - batch.Clear(); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); } while (ChangeCompactOptions()); } -TEST_F(DBTest, GetFilterByPrefixBloom) { - Options options = last_options_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - - 
WriteOptions wo; - ReadOptions ro; - FlushOptions fo; - fo.wait = true; - std::string value; - - ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); - ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); - ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - - dbfull()->Flush(fo); +TEST_F(DBTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); - ASSERT_EQ("foo", Get("barbarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("foo2", Get("barbarbar2")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + Options options = CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 1; + CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + // Compaction can still go through even if no thread can flush the + // mem table. 
+ ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); -} + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); -TEST_F(DBTest, WholeKeyFilterProp) { - Options options = last_options_; - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - options.statistics = rocksdb::CreateDBStatistics(); + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); - WriteOptions wo; - ReadOptions ro; - FlushOptions fo; - fo.wait = true; - std::string value; + // Flush can still go through. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - dbfull()->Flush(fo); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} - Reopen(options); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - - // Reopen with whole key filtering enabled and prefix extractor - // NULL. Bloom filter should be off for both of whole key and - // prefix bloom. 
- bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - options.prefix_extractor.reset(); - Reopen(options); +TEST_F(DBTest, FLUSH) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime); + ; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + // this will now also flush the last 2 writes + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - // Write DB with only full key filtering. - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + perf_context.Reset(); + Get(1, "foo"); + ASSERT_TRUE((int)perf_context.get_from_output_files_time > 0); - // Reopen with both of whole key off and prefix extractor enabled. - // Still no bloom filter should be used. 
- options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - - // Try to create a DB with mixed files: - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + ASSERT_OK(Flush(1)); - options.prefix_extractor.reset(); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v2", Get(1, "bar")); + perf_context.Reset(); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_TRUE((int)perf_context.get_from_output_files_time > 0); - // Try to create a DB with mixed files. - ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); - // In this case needs insert some keys to make sure files are - // not filtered out by key ranges. 
- ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - Flush(); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + ASSERT_OK(Flush(1)); - // Now we have two files: - // File 1: An older file with prefix bloom. - // File 2: A newer file with whole bloom filter. - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); - - // Reopen with the same setting: only whole key is used - Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); - - // Restart with both filters are allowed - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); - // File 1 will has it filtered out. - // File 2 will not, as prefix `foo` exists in the file. 
- ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - - // Restart with only prefix bloom is allowed. - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); -} - -TEST_F(DBTest, IterSeekBeforePrev) { - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("0", "f")); - ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("2", "j")); - auto iter = db_->NewIterator(ReadOptions()); - iter->Seek(Slice("c")); - iter->Prev(); - iter->Seek(Slice("a")); - iter->Prev(); - delete iter; -} + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + // 'foo' should be there because its put + // has WAL enabled. 
+ ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); -namespace { -std::string MakeLongKey(size_t length, char c) { - return std::string(length, c); + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); } -} // namespace - -TEST_F(DBTest, IterLongKeys) { - ASSERT_OK(Put(MakeLongKey(20, 0), "0")); - ASSERT_OK(Put(MakeLongKey(32, 2), "2")); - ASSERT_OK(Put("a", "b")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put(MakeLongKey(50, 1), "1")); - ASSERT_OK(Put(MakeLongKey(127, 3), "3")); - ASSERT_OK(Put(MakeLongKey(64, 4), "4")); - auto iter = db_->NewIterator(ReadOptions()); - - // Create a key that needs to be skipped for Seq too new - iter->Seek(MakeLongKey(20, 0)); - ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); - delete iter; - iter = db_->NewIterator(ReadOptions()); - iter->Seek(MakeLongKey(50, 1)); - ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); - delete iter; -} +#ifndef ROCKSDB_LITE +TEST_F(DBTest, FlushSchedule) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 1; + options.max_write_buffer_number = 2; + options.write_buffer_size = 120 * 1024; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; -TEST_F(DBTest, IterNextWithNewerSeq) { - ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("a", "b")); - 
ASSERT_OK(Put("c", "d")); - ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); + std::atomic thread_num(0); + // each column family will have 5 thread, each thread generating 2 memtables. + // each column family should end up with 10 table files + std::function fill_memtable_func = [&]() { + int a = thread_num.fetch_add(1); + Random rnd(a); + WriteOptions wo; + // this should fill up 2 memtables + for (int k = 0; k < 5000; ++k) { + ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + } + }; - // Create a key that needs to be skipped for Seq too new - for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; - i++) { - ASSERT_OK(Put("b", "f")); + for (int i = 0; i < 10; ++i) { + threads.emplace_back(fill_memtable_func); } - iter->Seek(Slice("a")); - ASSERT_EQ(IterStatus(iter), "a->b"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->d"); - delete iter; -} + for (auto& t : threads) { + t.join(); + } -TEST_F(DBTest, IterPrevWithNewerSeq) { - ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); - - // Create a key that needs to be skipped for Seq too new - for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; - i++) { - ASSERT_OK(Put("b", "f")); - } - - iter->Seek(Slice("d")); - ASSERT_EQ(IterStatus(iter), "d->e"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->d"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->b"); - - iter->Prev(); - delete iter; -} - -TEST_F(DBTest, IterPrevWithNewerSeq2) { - ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); - iter->Seek(Slice("c")); - ASSERT_EQ(IterStatus(iter), "c->d"); - - // Create a key that needs to be skipped for Seq too new - for (uint64_t i = 0; i < 
last_options_.max_sequential_skip_in_iterations + 1; - i++) { - ASSERT_OK(Put("b", "f")); - } - - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->b"); - - iter->Prev(); - delete iter; + auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); + auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + ASSERT_LE(default_tables, static_cast(10)); + ASSERT_GT(default_tables, static_cast(0)); + ASSERT_LE(pikachu_tables, static_cast(10)); + ASSERT_GT(pikachu_tables, static_cast(0)); } +#endif // ROCKSDB_LITE -TEST_F(DBTest, IterEmpty) { +TEST_F(DBTest, ManifestRollOver) { do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("foo"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; + Options options; + options.max_manifest_file_size = 10; // 10 bytes + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + { + ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); + ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); + ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); + uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_OK(Flush(1)); // This should trigger LogAndApply. + uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_GT(manifest_after_flush, manifest_before_flush); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); + // check if a new manifest file got inserted or not. 
+ ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); + ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2")); + ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3")); + } } while (ChangeCompactOptions()); } -TEST_F(DBTest, IterSingle) { +TEST_F(DBTest, IdentityAcrossRestarts) { do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); + std::string id1; + ASSERT_OK(db_->GetDbIdentity(id1)); - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); + Options options = CurrentOptions(); + Reopen(options); + std::string id2; + ASSERT_OK(db_->GetDbIdentity(id2)); + // id1 should match id2 because identity was not regenerated + ASSERT_EQ(id1.compare(id2), 0); - delete iter; + std::string idfilename = IdentityFileName(dbname_); + ASSERT_OK(env_->DeleteFile(idfilename)); + Reopen(options); + std::string id3; + ASSERT_OK(db_->GetDbIdentity(id3)); + // id1 should NOT match id3 because identity was regenerated + ASSERT_NE(id1.compare(id3), 0); } while (ChangeCompactOptions()); } -TEST_F(DBTest, IterMulti) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "a", "va")); - ASSERT_OK(Put(1, "b", "vb")); 
- ASSERT_OK(Put(1, "c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); +namespace { +class KeepFilter : public CompactionFilter { + public: + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, + bool* value_changed) const override { + return false; + } - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("ax"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("z"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - // Switch from reverse to forward - iter->SeekToLast(); - iter->Prev(); - iter->Prev(); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); + virtual const char* Name() const override { return "KeepFilter"; } +}; - // Switch from forward to reverse - iter->SeekToFirst(); - iter->Next(); - iter->Next(); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Make sure iter stays at snapshot - ASSERT_OK(Put(1, "a", "va2")); - ASSERT_OK(Put(1, "a2", "va3")); - ASSERT_OK(Put(1, "b", "vb2")); - ASSERT_OK(Put(1, "c", "vc2")); - ASSERT_OK(Delete(1, "b")); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - 
ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; - } while (ChangeCompactOptions()); -} +class KeepFilterFactory : public CompactionFilterFactory { + public: + explicit KeepFilterFactory(bool check_context = false) + : check_context_(check_context) {} -// Check that we can skip over a run of user keys -// by using reseek rather than sequential scan -TEST_F(DBTest, IterReseek) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - Options options = CurrentOptions(options_override); - options.max_sequential_skip_in_iterations = 3; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (check_context_) { + EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); + EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + } + return std::unique_ptr(new KeepFilter()); + } - // insert three keys with same userkey and verify that - // reseek is not invoked. For each of these test cases, - // verify that we can find the next key "b". 
- ASSERT_OK(Put(1, "a", "zero")); - ASSERT_OK(Put(1, "a", "one")); - ASSERT_OK(Put(1, "a", "two")); - ASSERT_OK(Put(1, "b", "bone")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - ASSERT_EQ(IterStatus(iter), "a->two"); - iter->Next(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - ASSERT_EQ(IterStatus(iter), "b->bone"); - delete iter; + virtual const char* Name() const override { return "KeepFilterFactory"; } + bool check_context_; + std::atomic_bool expect_full_compaction_; + std::atomic_bool expect_manual_compaction_; +}; - // insert a total of three keys with same userkey and verify - // that reseek is still not invoked. - ASSERT_OK(Put(1, "a", "three")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->three"); - iter->Next(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - ASSERT_EQ(IterStatus(iter), "b->bone"); - delete iter; +class DelayFilter : public CompactionFilter { + public: + explicit DelayFilter(DBTestBase* d) : db_test(d) {} + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, + bool* value_changed) const override { + db_test->env_->addon_time_.fetch_add(1000); + return true; + } - // insert a total of four keys with same userkey and verify - // that reseek is invoked. 
- ASSERT_OK(Put(1, "a", "four")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->four"); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - iter->Next(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); - ASSERT_EQ(IterStatus(iter), "b->bone"); - delete iter; + virtual const char* Name() const override { return "DelayFilter"; } - // Testing reverse iterator - // At this point, we have three versions of "a" and one version of "b". - // The reseek statistics is already at 1. - int num_reseeks = - (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); - - // Insert another version of b and assert that reseek is not invoked - ASSERT_OK(Put(1, "b", "btwo")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "b->btwo"); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks); - iter->Prev(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks + 1); - ASSERT_EQ(IterStatus(iter), "a->four"); - delete iter; + private: + DBTestBase* db_test; +}; - // insert two more versions of b. This makes a total of 4 versions - // of b and 4 versions of a. 
- ASSERT_OK(Put(1, "b", "bthree")); - ASSERT_OK(Put(1, "b", "bfour")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "b->bfour"); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks + 2); - iter->Prev(); - - // the previous Prev call should have invoked reseek - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks + 3); - ASSERT_EQ(IterStatus(iter), "a->four"); - delete iter; -} +class DelayFilterFactory : public CompactionFilterFactory { + public: + explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr(new DelayFilter(db_test)); + } -TEST_F(DBTest, IterSmallAndLargeMix) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "a", "va")); - ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); - ASSERT_OK(Put(1, "c", "vc")); - ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); - ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); + virtual const char* Name() const override { return "DelayFilterFactory"; } - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + private: + DBTestBase* db_test; +}; +} // namespace - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - 
ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; - } while (ChangeCompactOptions()); +#ifndef ROCKSDB_LITE + +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; } +#endif // ROCKSDB_LITE -TEST_F(DBTest, IterMultiWithDelete) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "ka", "va")); - ASSERT_OK(Put(1, "kb", "vb")); - ASSERT_OK(Put(1, "kc", "vc")); - ASSERT_OK(Delete(1, "kb")); - ASSERT_EQ("NOT_FOUND", Get(1, "kb")); - - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->Seek("kc"); - ASSERT_EQ(IterStatus(iter), "kc->vc"); - if (!CurrentOptions().merge_operator) { - // TODO: merge operator does not support backward iteration yet - if (kPlainTableAllBytesPrefix != option_config_&& - kBlockBasedTableWithWholeKeyHashIndex != option_config_ && - kHashLinkList != option_config_) { - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "ka->va"); - } - } - delete iter; - } while (ChangeOptions()); +TEST_F(DBTest, FailMoreDbPaths) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 10000000); + options.db_paths.emplace_back(dbname_ + "_2", 1000000); + options.db_paths.emplace_back(dbname_ + "_3", 1000000); + options.db_paths.emplace_back(dbname_ + "_4", 1000000); + options.db_paths.emplace_back(dbname_ + "_5", 1000000); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); } -TEST_F(DBTest, IterPrevMaxSkip) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - for (int i = 0; i < 2; i++) { - ASSERT_OK(Put(1, "key1", "v1")); - ASSERT_OK(Put(1, "key2", "v2")); - ASSERT_OK(Put(1, "key3", "v3")); - ASSERT_OK(Put(1, "key4", "v4")); - ASSERT_OK(Put(1, "key5", "v5")); +void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { + uint64_t cf_size = 0; + 
uint64_t cf_csize = 0; + size_t file_count = 0; + for (auto level_meta : cf_meta.levels) { + uint64_t level_size = 0; + uint64_t level_csize = 0; + file_count += level_meta.files.size(); + for (auto file_meta : level_meta.files) { + level_size += file_meta.size; } + ASSERT_EQ(level_meta.size, level_size); + cf_size += level_size; + cf_csize += level_csize; + } + ASSERT_EQ(cf_meta.file_count, file_count); + ASSERT_EQ(cf_meta.size, cf_size); +} - VerifyIterLast("key5->v5", 1); - - ASSERT_OK(Delete(1, "key5")); - VerifyIterLast("key4->v4", 1); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, ColumnFamilyMetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); - ASSERT_OK(Delete(1, "key4")); - VerifyIterLast("key3->v3", 1); + Random rnd(301); + int key_index = 0; + ColumnFamilyMetaData cf_meta; + for (int i = 0; i < 100; ++i) { + GenerateNewFile(&rnd, &key_index); + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta); + } +} - ASSERT_OK(Delete(1, "key3")); - VerifyIterLast("key2->v2", 1); +namespace { +void MinLevelHelper(DBTest* self, Options& options) { + Random rnd(301); - ASSERT_OK(Delete(1, "key2")); - VerifyIterLast("key1->v1", 1); + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(DBTestBase::RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); + } + self->dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); + } - ASSERT_OK(Delete(1, "key1")); - VerifyIterLast("(invalid)", 1); - } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); -} + // generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(DBTestBase::RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), 
values[i])); + } + self->dbfull()->TEST_WaitForCompact(); -TEST_F(DBTest, IterWithSnapshot) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - ASSERT_OK(Put(1, "key1", "val1")); - ASSERT_OK(Put(1, "key2", "val2")); - ASSERT_OK(Put(1, "key3", "val3")); - ASSERT_OK(Put(1, "key4", "val4")); - ASSERT_OK(Put(1, "key5", "val5")); - - const Snapshot *snapshot = db_->GetSnapshot(); - ReadOptions options; - options.snapshot = snapshot; - Iterator* iter = db_->NewIterator(options, handles_[1]); - - // Put more values after the snapshot - ASSERT_OK(Put(1, "key100", "val100")); - ASSERT_OK(Put(1, "key101", "val101")); - - iter->Seek("key5"); - ASSERT_EQ(IterStatus(iter), "key5->val5"); - if (!CurrentOptions().merge_operator) { - // TODO: merge operator does not support backward iteration yet - if (kPlainTableAllBytesPrefix != option_config_&& - kBlockBasedTableWithWholeKeyHashIndex != option_config_ && - kHashLinkList != option_config_) { - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "key4->val4"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "key3->val3"); - - iter->Next(); - ASSERT_EQ(IterStatus(iter), "key4->val4"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "key5->val5"); - } - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - } - db_->ReleaseSnapshot(snapshot); - delete iter; - // skip as HashCuckooRep does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo)); + ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); + ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); } -TEST_F(DBTest, Recover) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_OK(Put(1, "baz", "v5")); +// returns false if the calling-Test should be skipped +bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, + int lev, int strategy) { + fprintf(stderr, + "Test with compression options : window_bits = %d, 
level = %d, " + "strategy = %d}\n", + wbits, lev, strategy); + options.write_buffer_size = 100 << 10; // 100KB + options.arena_block_size = 4096; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 3; + options.create_if_missing = true; - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v1", Get(1, "foo")); - - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v5", Get(1, "baz")); - ASSERT_OK(Put(1, "bar", "v2")); - ASSERT_OK(Put(1, "foo", "v3")); + if (Snappy_Supported()) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (Zlib_Supported()) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2_Supported()) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else if (LZ4_Supported()) { + type = kLZ4Compression; + fprintf(stderr, "using lz4\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return false; + } + options.compression_per_level.resize(options.num_levels); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v3", Get(1, "foo")); - ASSERT_OK(Put(1, "foo", "v4")); - ASSERT_EQ("v4", Get(1, "foo")); - ASSERT_EQ("v2", Get(1, "bar")); - ASSERT_EQ("v5", Get(1, "baz")); - } while (ChangeOptions()); + // do not compress L0 + for (int i = 0; i < 1; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 1; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + return true; } +} // namespace -TEST_F(DBTest, RecoverWithTableHandle) { - do { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.disable_auto_compactions = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_OK(Put(1, "bar", "v2")); - ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "foo", "v3")); - ASSERT_OK(Put(1, "bar", "v4")); - ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "big", 
std::string(100, 'a'))); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); +TEST_F(DBTest, MinLevelToCompress1) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, -14, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); - std::vector> files; - dbfull()->TEST_GetFilesMetaData(handles_[1], &files); - size_t total_files = 0; - for (const auto& level : files) { - total_files += level.size(); - } - ASSERT_EQ(total_files, 3); - for (const auto& level : files) { - for (const auto& file : level) { - if (kInfiniteMaxOpenFiles == option_config_) { - ASSERT_TRUE(file.table_reader_handle != nullptr); - } else { - ASSERT_TRUE(file.table_reader_handle == nullptr); - } - } - } - } while (ChangeOptions()); + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(options); + MinLevelHelper(this, options); } -TEST_F(DBTest, IgnoreRecoveredLog) { - std::string backup_logs = dbname_ + "/backup_logs"; +TEST_F(DBTest, MinLevelToCompress2) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, 15, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); - // delete old files in backup_logs directory - env_->CreateDirIfMissing(backup_logs); - std::vector old_files; - env_->GetChildren(backup_logs, &old_files); - for (auto& file : old_files) { - if (file != "." 
&& file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; } + DestroyAndReopen(options); + MinLevelHelper(this, options); +} +TEST_F(DBTest, RepeatedWritesToSameKey) { do { Options options = CurrentOptions(); - options.create_if_missing = true; - options.merge_operator = MergeOperators::CreateUInt64AddOperator(); - options.wal_dir = dbname_ + "/logs"; - DestroyAndReopen(options); - - // fill up the DB - std::string one, two; - PutFixed64(&one, 1); - PutFixed64(&two, 2); - ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); - ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); - ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one))); - - // copy the logs to backup - std::vector logs; - env_->GetChildren(options.wal_dir, &logs); - for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); - } - } - - // recover the DB - Reopen(options); - ASSERT_EQ(two, Get("foo")); - ASSERT_EQ(one, Get("bar")); - Close(); - - // copy the logs from backup back to wal dir - for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } - } - // this should ignore the log files, recovery should not happen again - // if the recovery happens, the same merge operator would be called twice, - // leading to incorrect results - Reopen(options); - ASSERT_EQ(two, Get("foo")); - ASSERT_EQ(one, Get("bar")); - Close(); - Destroy(options); - Reopen(options); - Close(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + CreateAndReopenWithCF({"pikachu"}, options); - // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); - for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } - } - // assert that we successfully recovered only from logs, even though we - // destroyed the DB - Reopen(options); - ASSERT_EQ(two, Get("foo")); - ASSERT_EQ(one, Get("bar")); + // We must have at most one file per level except for level-0, + // which may have up to kL0_StopWritesTrigger files. + const int kMaxFiles = + options.num_levels + options.level0_stop_writes_trigger; - // Recovery will fail if DB directory doesn't exist. - Destroy(options); - // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); - for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - // we won't be needing this file no more - env_->DeleteFile(backup_logs + "/" + log); - } + Random rnd(301); + std::string value = + RandomString(&rnd, static_cast(2 * options.write_buffer_size)); + for (int i = 0; i < 5 * kMaxFiles; i++) { + ASSERT_OK(Put(1, "key", value)); + ASSERT_LE(TotalTableFiles(1), kMaxFiles); } - Status s = TryReopen(options); - ASSERT_TRUE(!s.ok()); - } while (ChangeOptions(kSkipHashCuckoo)); -} - -TEST_F(DBTest, CheckLock) { - do { - DB* localdb; - Options options = CurrentOptions(); - ASSERT_OK(TryReopen(options)); - - // second open should fail - ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); } while (ChangeCompactOptions()); } +#endif // ROCKSDB_LITE -TEST_F(DBTest, FlushMultipleMemtable) { +TEST_F(DBTest, SparseMerge) { do { Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = -1; + options.compression = kNoCompression; CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + + FillLevels("A", "Z", 1); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. 
+ const std::string value(1000, 'x'); + Put(1, "A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } + Put(1, "C", "vc"); ASSERT_OK(Flush(1)); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); + // Make sparse update + Put(1, "A", "va2"); + Put(1, "B100", "bvalue2"); + Put(1, "C", "vc2"); ASSERT_OK(Flush(1)); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); } while (ChangeCompactOptions()); } -TEST_F(DBTest, FlushEmptyColumnFamily) { - // Block flush thread and disable compaction thread - env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - test::SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_high, Env::Priority::HIGH); +#ifndef ROCKSDB_LITE +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} +TEST_F(DBTest, ApproximateSizesMemTable) { Options options = 
CurrentOptions(); - // disable compaction - options.disable_auto_compactions = true; - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 2; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; - CreateAndReopenWithCF({"pikachu"}, options); - - // Compaction can still go through even if no thread can flush the - // mem table. - ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); - // Insert can go through - ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } - ASSERT_EQ("v1", Get(0, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); + uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + // Zero if not including mem table + db_->GetApproximateSizes(&r, 1, &size, false); + ASSERT_EQ(size, 0); - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); + start = Key(500); + end = Key(600); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); - // Flush can still go through. 
- ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + } - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); -} + start = Key(500); + end = Key(600); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); -TEST_F(DBTest, FLUSH) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - SetPerfLevel(kEnableTime);; - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); - // this will now also flush the last 2 writes - ASSERT_OK(Flush(1)); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + start = Key(100); + end = Key(1020); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_GT(size, 6000); - perf_context.Reset(); - Get(1, "foo"); - ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + options.max_write_buffer_number = 8; + options.min_write_buffer_number_to_merge = 5; + options.write_buffer_size = 1024 * N; // Not very large + DestroyAndReopen(options); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); + int keys[N * 3]; + for (int i = 0; i < N; i++) { + keys[i * 3] = i * 5; + keys[i * 3 + 1] = i * 5 + 1; + keys[i * 3 + 2] = i * 5 + 2; + } + std::random_shuffle(std::begin(keys), std::end(keys)); - writeOpt.disableWAL = true; - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); - ASSERT_OK(Flush(1)); + for (int i = 0; i < N * 3; i++) { + ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024))); + } - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v2", Get(1, "bar")); - perf_context.Reset(); - ASSERT_EQ("v2", Get(1, "foo")); - ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); 
+ start = Key(100); + end = Key(300); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); - writeOpt.disableWAL = false; - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); - ASSERT_OK(Flush(1)); + start = Key(1050); + end = Key(1080); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_GT(size, 6000); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - // 'foo' should be there because its put - // has WAL enabled. - ASSERT_EQ("v3", Get(1, "foo")); - ASSERT_EQ("v3", Get(1, "bar")); + start = Key(2100); + end = Key(2300); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); - SetPerfLevel(kDisable); - } while (ChangeCompactOptions()); + start = Key(1050); + end = Key(1080); + r = Range(start, end); + uint64_t size_with_mt, size_without_mt; + db_->GetApproximateSizes(&r, 1, &size_with_mt, true); + ASSERT_GT(size_with_mt, 6000); + db_->GetApproximateSizes(&r, 1, &size_without_mt, false); + ASSERT_EQ(size_without_mt, 0); + + Flush(); + + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024))); + } + + start = Key(1050); + end = Key(1080); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size_with_mt, true); + db_->GetApproximateSizes(&r, 1, &size_without_mt, false); + ASSERT_GT(size_with_mt, size_without_mt); + ASSERT_GT(size_without_mt, 6000); } -TEST_F(DBTest, RecoveryWithEmptyLog) { +TEST_F(DBTest, ApproximateSizes) { do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_OK(Put(1, "foo", "v2")); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v3", Get(1, "foo")); 
- } while (ChangeOptions()); -} + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); -#ifndef ROCKSDB_LITE -TEST_F(DBTest, FlushSchedule) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.level0_stop_writes_trigger = 1 << 10; - options.level0_slowdown_writes_trigger = 1 << 10; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; - options.max_write_buffer_number = 2; - options.write_buffer_size = 120 * 1024; - CreateAndReopenWithCF({"pikachu"}, options); - std::vector threads; + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); - std::atomic thread_num(0); - // each column family will have 5 thread, each thread generating 2 memtables. 
- // each column family should end up with 10 table files - std::function fill_memtable_func = [&]() { - int a = thread_num.fetch_add(1); - Random rnd(a); - WriteOptions wo; - // this should fill up 2 memtables - for (int k = 0; k < 5000; ++k) { - ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + const int N = 80; + static const int S1 = 100000; + static const int S2 = 105000; // Allow some expansion from metadata + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1))); } - }; - for (int i = 0; i < 10; ++i) { - threads.emplace_back(fill_memtable_func); - } + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); - for (auto& t : threads) { - t.join(); - } + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + ReopenWithColumnFamilies({"default", "pikachu"}, options); - auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); - auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); - ASSERT_LE(default_tables, static_cast(10)); - ASSERT_GT(default_tables, static_cast(0)); - ASSERT_LE(pikachu_tables, static_cast(10)); - ASSERT_GT(pikachu_tables, static_cast(0)); + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); + ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), + S2 * (i + 1))); + ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); + } + ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); + ASSERT_TRUE( + Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = 
cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); + } + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable | kSkipHashIndex)); } -#endif // ROCKSDB_LITE -TEST_F(DBTest, ManifestRollOver) { +TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { do { - Options options; - options.max_manifest_file_size = 10 ; // 10 bytes - options = CurrentOptions(options); + Options options = CurrentOptions(); + options.compression = kNoCompression; CreateAndReopenWithCF({"pikachu"}, options); - { - ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); - ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); - ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); - uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); - ASSERT_OK(Flush(1)); // This should trigger LogAndApply. - uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); - ASSERT_GT(manifest_after_flush, manifest_before_flush); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(2), big1)); + ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(4), big1)); + ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); - // check if a new manifest file got inserted or not. 
- ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); - ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2")); - ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3")); - } - } while (ChangeCompactOptions()); -} -TEST_F(DBTest, IdentityAcrossRestarts) { - do { - std::string id1; - ASSERT_OK(db_->GetDbIdentity(id1)); + ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); - Options options = CurrentOptions(); - Reopen(options); - std::string id2; - ASSERT_OK(db_->GetDbIdentity(id2)); - // id1 should match id2 because identity was not regenerated - ASSERT_EQ(id1.compare(id2), 0); + ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); - std::string idfilename = IdentityFileName(dbname_); - ASSERT_OK(env_->DeleteFile(idfilename)); - Reopen(options); - std::string id3; - ASSERT_OK(db_->GetDbIdentity(id3)); - // id1 should NOT match id3 because identity was regenerated - ASSERT_NE(id1.compare(id3), 0); - } while (ChangeCompactOptions()); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. 
+ } while (ChangeOptions(kSkipPlainTable)); } +#endif // ROCKSDB_LITE #ifndef ROCKSDB_LITE -TEST_F(DBTest, RecoverWithLargeLog) { +TEST_F(DBTest, Snapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(Put(1, "big1", std::string(200000, '1'))); - ASSERT_OK(Put(1, "big2", std::string(200000, '2'))); - ASSERT_OK(Put(1, "small3", std::string(10, '3'))); - ASSERT_OK(Put(1, "small4", std::string(10, '4'))); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - } + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + Put(0, "foo", "0v1"); + Put(1, "foo", "1v1"); - // Make sure that if we re-open with a small write buffer size that - // we flush table files in the middle of a large log file. - Options options; - options.write_buffer_size = 100000; - options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); - ASSERT_EQ(std::string(200000, '1'), Get(1, "big1")); - ASSERT_EQ(std::string(200000, '2'), Get(1, "big2")); - ASSERT_EQ(std::string(10, '3'), Get(1, "small3")); - ASSERT_EQ(std::string(10, '4'), Get(1, "small4")); - ASSERT_GT(NumTableFilesAtLevel(0, 1), 1); - } while (ChangeCompactOptions()); -} -#endif // ROCKSDB_LITE + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_EQ(1U, GetNumSnapshots()); + uint64_t time_snap1 = GetTimeOldestSnapshots(); + ASSERT_GT(time_snap1, 0U); + Put(0, "foo", "0v2"); + Put(1, "foo", "1v2"); -namespace { -class KeepFilter : public CompactionFilter { - public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { - return false; - } + env_->addon_time_.fetch_add(1); - virtual const char* Name() const override { return "KeepFilter"; } -}; + const Snapshot* s2 = db_->GetSnapshot(); + ASSERT_EQ(2U, GetNumSnapshots()); 
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + Put(0, "foo", "0v3"); + Put(1, "foo", "1v3"); -class KeepFilterFactory : public CompactionFilterFactory { - public: - explicit KeepFilterFactory(bool check_context = false) - : check_context_(check_context) {} + { + ManagedSnapshot s3(db_); + ASSERT_EQ(3U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { - if (check_context_) { - EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); - EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + Put(0, "foo", "0v4"); + Put(1, "foo", "1v4"); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot())); + ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot())); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); } - return std::unique_ptr(new KeepFilter()); - } - - virtual const char* Name() const override { return "KeepFilterFactory"; } - bool check_context_; - std::atomic_bool expect_full_compaction_; - std::atomic_bool expect_manual_compaction_; -}; -class DelayFilter : public CompactionFilter { - public: - explicit DelayFilter(DBTestBase* d) : db_test(d) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { - db_test->env_->addon_time_.fetch_add(1000); - return true; - } + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); - virtual const char* Name() const override { return "DelayFilter"; } + 
db_->ReleaseSnapshot(s1); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + ASSERT_EQ(1U, GetNumSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); - private: - DBTestBase* db_test; -}; + db_->ReleaseSnapshot(s2); + ASSERT_EQ(0U, GetNumSnapshots()); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } while (ChangeOptions(kSkipHashCuckoo)); +} -class DelayFilterFactory : public CompactionFilterFactory { - public: - explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { - return std::unique_ptr(new DelayFilter(db_test)); - } +TEST_F(DBTest, HiddenValuesAreRemoved) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + FillLevels("a", "z", 1); - virtual const char* Name() const override { return "DelayFilterFactory"; } + std::string big = RandomString(&rnd, 50000); + Put(1, "foo", big); + Put(1, "pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put(1, "foo", "tiny"); + Put(1, "pastfoo2", "v2"); // Advance sequence number one more - private: - DBTestBase* db_test; -}; -} // namespace + ASSERT_OK(Flush(1)); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); -#ifndef ROCKSDB_LITE -TEST_F(DBTest, CompressedCache) { - if (!Snappy_Supported()) { - return; - } - int num_iter = 80; + ASSERT_EQ(big, Get(1, "foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); + Slice x("x"); + dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + 
ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); + dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - // Run this test three iterations. - // Iteration 1: only a uncompressed block cache - // Iteration 2: only a compressed block cache - // Iteration 3: both block cache and compressed cache - // Iteration 4: both block cache and compressed cache, but DB is not - // compressed - for (int iter = 0; iter < 4; iter++) { - Options options = CurrentOptions(); - options.write_buffer_size = 64*1024; // small write buffer - options.statistics = rocksdb::CreateDBStatistics(); + ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). + // skip HashCuckooRep as it does not support snapshot + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable | kSkipHashCuckoo)); +} +#endif // ROCKSDB_LITE - BlockBasedTableOptions table_options; - switch (iter) { - case 0: - // only uncompressed block cache - table_options.block_cache = NewLRUCache(8*1024); - table_options.block_cache_compressed = nullptr; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 1: - // no block cache, only compressed cache - table_options.no_block_cache = true; - table_options.block_cache = nullptr; - table_options.block_cache_compressed = NewLRUCache(8*1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 2: - // both compressed and uncompressed block cache - table_options.block_cache = NewLRUCache(1024); - table_options.block_cache_compressed = NewLRUCache(8*1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 3: - // both block cache and compressed cache, but DB is not compressed - // also, make block cache sizes bigger, to trigger block cache hits - table_options.block_cache = NewLRUCache(1024 * 1024); 
- table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.compression = kNoCompression; - break; - default: - ASSERT_TRUE(false); - } +TEST_F(DBTest, CompactBetweenSnapshots) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - // default column family doesn't have block cache - Options no_block_cache_opts; - no_block_cache_opts.statistics = options.statistics; - no_block_cache_opts = CurrentOptions(no_block_cache_opts); - BlockBasedTableOptions table_options_no_bc; - table_options_no_bc.no_block_cache = true; - no_block_cache_opts.table_factory.reset( - NewBlockBasedTableFactory(table_options_no_bc)); - ReopenWithColumnFamilies({"default", "pikachu"}, - std::vector({no_block_cache_opts, options})); - Random rnd(301); + FillLevels("a", "z", 1); - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - std::vector values; - std::string str; - for (int i = 0; i < num_iter; i++) { - if (i % 4 == 0) { // high compression ratio - str = RandomString(&rnd, 1000); - } - values.push_back(str); - ASSERT_OK(Put(1, Key(i), values[i])); - } + Put(1, "foo", "first"); + const Snapshot* snapshot1 = db_->GetSnapshot(); + Put(1, "foo", "second"); + Put(1, "foo", "third"); + Put(1, "foo", "fourth"); + const Snapshot* snapshot2 = db_->GetSnapshot(); + Put(1, "foo", "fifth"); + Put(1, "foo", "sixth"); + + // All entries (including duplicates) exist + // before any compaction or flush is triggered. 
+ ASSERT_EQ(AllEntriesFor("foo", 1), + "[ sixth, fifth, fourth, third, second, first ]"); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ("first", Get(1, "foo", snapshot1)); - // flush all data from memtable so that reads are from block cache + // After a flush, "second", "third" and "fifth" should + // be removed ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); - for (int i = 0; i < num_iter; i++) { - ASSERT_EQ(Get(1, Key(i)), values[i]); - } + // after we release the snapshot1, only two values left + db_->ReleaseSnapshot(snapshot1); + FillLevels("a", "z", 1); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); - // check that we triggered the appropriate code paths in the cache - switch (iter) { - case 0: - // only uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 1: - // no block cache, only compressed cache - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 2: - // both compressed and uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 3: - // both compressed and uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - // compressed doesn't have any hits since blocks are not compressed on - // storage - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0); - break; - default: - ASSERT_TRUE(false); - } + // We have only one valid snapshot snapshot2. 
Since snapshot1 is + // not valid anymore, "first" should be removed by a compaction. + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); - options.create_if_missing = true; - DestroyAndReopen(options); - } + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z", 1); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); + // skip HashCuckooRep as it does not support snapshot + } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction)); } -static std::string CompressibleString(Random* rnd, int len) { - std::string r; - test::CompressibleString(rnd, 0.8, len, &r); - return r; -} -#endif // ROCKSDB_LITE +TEST_F(DBTest, UnremovableSingleDelete) { + // If we compact: + // + // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2) + // + // We do not want to end up with: + // + // Put(A, v1) Snapshot Put(A, v2) + // + // Because a subsequent SingleDelete(A) would delete the Put(A, v2) + // but not Put(A, v1), so Get(A) would return v1. 
+ anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); -TEST_F(DBTest, FailMoreDbPaths) { - Options options = CurrentOptions(); - options.db_paths.emplace_back(dbname_, 10000000); - options.db_paths.emplace_back(dbname_ + "_2", 1000000); - options.db_paths.emplace_back(dbname_ + "_3", 1000000); - options.db_paths.emplace_back(dbname_ + "_4", 1000000); - options.db_paths.emplace_back(dbname_ + "_5", 1000000); - ASSERT_TRUE(TryReopen(options).IsNotSupported()); -} + Put(1, "foo", "first"); + const Snapshot* snapshot = db_->GetSnapshot(); + SingleDelete(1, "foo"); + Put(1, "foo", "second"); + ASSERT_OK(Flush(1)); -void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { - uint64_t cf_size = 0; - uint64_t cf_csize = 0; - size_t file_count = 0; - for (auto level_meta : cf_meta.levels) { - uint64_t level_size = 0; - uint64_t level_csize = 0; - file_count += level_meta.files.size(); - for (auto file_meta : level_meta.files) { - level_size += file_meta.size; - } - ASSERT_EQ(level_meta.size, level_size); - cf_size += level_size; - cf_csize += level_csize; - } - ASSERT_EQ(cf_meta.file_count, file_count); - ASSERT_EQ(cf_meta.size, cf_size); -} - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, ColumnFamilyMetaDataTest) { - Options options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("second", Get(1, "foo")); - Random rnd(301); - int key_index = 0; - ColumnFamilyMetaData cf_meta; - for (int i = 0; i < 100; ++i) { - GenerateNewFile(&rnd, &key_index); - db_->GetColumnFamilyMetaData(&cf_meta); - CheckColumnFamilyMeta(cf_meta); - } -} + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); -namespace { -void 
MinLevelHelper(DBTest* self, Options& options) { - Random rnd(301); + SingleDelete(1, "foo"); - for (int num = 0; - num < options.level0_file_num_compaction_trigger - 1; - num++) - { - std::vector values; - // Write 120KB (12 values, each 10K) - for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); - ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); - } - self->dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); - } + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - //generate one more file in level-0, and should trigger level-0 compaction - std::vector values; - for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); - ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); - } - self->dbfull()->TEST_WaitForCompact(); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); - ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); - ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + db_->ReleaseSnapshot(snapshot); + // Skip HashCuckooRep as it does not support single delete. FIFO and + // universal compaction do not apply to the test case. Skip MergePut + // because single delete does not get removed when it encounters a merge. 
+ } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | + kSkipUniversalCompaction | kSkipMergePut)); } -// returns false if the calling-Test should be skipped -bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, - int lev, int strategy) { - fprintf(stderr, "Test with compression options : window_bits = %d, level = %d, strategy = %d}\n", wbits, lev, strategy); - options.write_buffer_size = 100<<10; //100KB - options.arena_block_size = 4096; - options.num_levels = 3; - options.level0_file_num_compaction_trigger = 3; - options.create_if_missing = true; +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DeletionMarkers1) { + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - if (Snappy_Supported()) { - type = kSnappyCompression; - fprintf(stderr, "using snappy\n"); - } else if (Zlib_Supported()) { - type = kZlibCompression; - fprintf(stderr, "using zlib\n"); - } else if (BZip2_Supported()) { - type = kBZip2Compression; - fprintf(stderr, "using bzip2\n"); - } else if (LZ4_Supported()) { - type = kLZ4Compression; - fprintf(stderr, "using lz4\n"); - } else { - fprintf(stderr, "skipping test, compression disabled\n"); - return false; - } - options.compression_per_level.resize(options.num_levels); + // Place a table at level last-1 to prevent merging with preceding mutation + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - // do not compress L0 - for (int i = 0; i < 1; i++) { - options.compression_per_level[i] = kNoCompression; - } - for (int i = 1; i < options.num_levels; i++) { - options.compression_per_level[i] = type; - } - return true; + Delete(1, "foo"); + 
Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + Slice z("z"); + dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); } -} // namespace -TEST_F(DBTest, MinLevelToCompress1) { +TEST_F(DBTest, DeletionMarkers2) { Options options = CurrentOptions(); - CompressionType type = kSnappyCompression; - if (!MinLevelToCompress(type, options, -14, -1, 0)) { - return; - } - Reopen(options); - MinLevelHelper(this, options); - - // do not compress L0 and L1 - for (int i = 0; i < 2; i++) { - options.compression_per_level[i] = kNoCompression; - } - for (int i = 2; i < options.num_levels; i++) { - options.compression_per_level[i] = type; - } - DestroyAndReopen(options); - MinLevelHelper(this, options); -} + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); -TEST_F(DBTest, MinLevelToCompress2) { - Options options = CurrentOptions(); - CompressionType type = kSnappyCompression; - if (!MinLevelToCompress(type, options, 15, -1, 0)) { - return; - } - Reopen(options); - MinLevelHelper(this, options); + // Place a table at level last-1 to prevent merging with preceding mutation + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - // do not compress L0 
and L1 - for (int i = 0; i < 2; i++) { - options.compression_per_level[i] = kNoCompression; - } - for (int i = 2; i < options.num_levels; i++) { - options.compression_per_level[i] = type; - } - DestroyAndReopen(options); - MinLevelHelper(this, options); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); + // DEL kept: "last" file overlaps + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); } -TEST_F(DBTest, RepeatedWritesToSameKey) { +TEST_F(DBTest, OverlapInLevel0) { do { Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer CreateAndReopenWithCF({"pikachu"}, options); - // We must have at most one file per level except for level-0, - // which may have up to kL0_StopWritesTrigger files. - const int kMaxFiles = - options.num_levels + options.level0_stop_writes_trigger; + // Fill levels 1 and 2 to disable the pushing of new memtables to levels > + // 0. + ASSERT_OK(Put(1, "100", "v100")); + ASSERT_OK(Put(1, "999", "v999")); + Flush(1); + MoveFilesToLevel(2, 1); + ASSERT_OK(Delete(1, "100")); + ASSERT_OK(Delete(1, "999")); + Flush(1); + MoveFilesToLevel(1, 1); + ASSERT_EQ("0,1,1", FilesPerLevel(1)); - Random rnd(301); - std::string value = - RandomString(&rnd, static_cast(2 * options.write_buffer_size)); - for (int i = 0; i < 5 * kMaxFiles; i++) { - ASSERT_OK(Put(1, "key", value)); - ASSERT_LE(TotalTableFiles(1), kMaxFiles); - } - } while (ChangeCompactOptions()); -} -#endif // ROCKSDB_LITE + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 
500 + // Note that files are sorted by smallest key. + ASSERT_OK(Put(1, "300", "v300")); + ASSERT_OK(Put(1, "500", "v500")); + Flush(1); + ASSERT_OK(Put(1, "200", "v200")); + ASSERT_OK(Put(1, "600", "v600")); + ASSERT_OK(Put(1, "900", "v900")); + Flush(1); + ASSERT_EQ("2,1,1", FilesPerLevel(1)); -TEST_F(DBTest, SparseMerge) { - do { - Options options = CurrentOptions(); - options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, options); + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); + ASSERT_EQ("2", FilesPerLevel(1)); - FillLevels("A", "Z", 1); + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. + ASSERT_OK(Delete(1, "600")); + Flush(1); + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ("NOT_FOUND", Get(1, "600")); + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); +} +#endif // ROCKSDB_LITE - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. 
- const std::string value(1000, 'x'); - Put(1, "A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); +TEST_F(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + virtual const char* Name() const override { + return "rocksdb.NewComparator"; } - Put(1, "C", "vc"); - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - - // Make sparse update - Put(1, "A", "va2"); - Put(1, "B100", "bvalue2"); - Put(1, "C", "vc2"); - ASSERT_OK(Flush(1)); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); + virtual int Compare(const Slice& a, const Slice& b) const override { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, + const Slice& l) const override { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const override { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + Options new_options, options; + NewComparator cmp; + do { + options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + new_options = CurrentOptions(); + new_options.comparator = &cmp; + // only the non-default column family has non-matching comparator + Status s = TryReopenWithColumnFamilies( + {"default", "pikachu"}, std::vector({options, new_options})); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); } 
while (ChangeCompactOptions()); } -#ifndef ROCKSDB_LITE -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; +TEST_F(DBTest, CustomComparator) { + class NumberComparator : public Comparator { + public: + virtual const char* Name() const override { + return "test.NumberComparator"; + } + virtual int Compare(const Slice& a, const Slice& b) const override { + return ToNumber(a) - ToNumber(b); + } + virtual void FindShortestSeparator(std::string* s, + const Slice& l) const override { + ToNumber(*s); // Check format + ToNumber(l); // Check format + } + virtual void FindShortSuccessor(std::string* key) const override { + ToNumber(*key); // Check format + } + + private: + static int ToNumber(const Slice& x) { + // Check that there are no extra characters. 
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']') + << EscapeString(x); + int val; + char ignored; + EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) + << EscapeString(x); + return val; + } + }; + Options new_options; + NumberComparator cmp; + do { + new_options = CurrentOptions(); + new_options.create_if_missing = true; + new_options.comparator = &cmp; + new_options.write_buffer_size = 4096; // Compact more often + new_options.arena_block_size = 4096; + new_options = CurrentOptions(new_options); + DestroyAndReopen(new_options); + CreateAndReopenWithCF({"pikachu"}, new_options); + ASSERT_OK(Put(1, "[10]", "ten")); + ASSERT_OK(Put(1, "[0x14]", "twenty")); + for (int i = 0; i < 2; i++) { + ASSERT_EQ("ten", Get(1, "[10]")); + ASSERT_EQ("ten", Get(1, "[0xa]")); + ASSERT_EQ("twenty", Get(1, "[20]")); + ASSERT_EQ("twenty", Get(1, "[0x14]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[15]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]")); + Compact(1, "[0]", "[9999]"); + } + + for (int run = 0; run < 2; run++) { + for (int i = 0; i < 1000; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "[%d]", i * 10); + ASSERT_OK(Put(1, buf, buf)); + } + Compact(1, "[0]", "[1000000]"); + } + } while (ChangeCompactOptions()); } -TEST_F(DBTest, ApproximateSizesMemTable) { +TEST_F(DBTest, DBOpen_Options) { Options options = CurrentOptions(); - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - options.create_if_missing = true; - DestroyAndReopen(options); + std::string dbname = test::TmpDir(env_) + "/db_options_test"; + ASSERT_OK(DestroyDB(dbname, options)); - const int N = 128; - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - } + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + options.create_if_missing = false; + Status s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") 
!= nullptr); + ASSERT_TRUE(db == nullptr); - uint64_t size; - std::string start = Key(50); - std::string end = Key(60); - Range r(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_GT(size, 6000); - ASSERT_LT(size, 204800); - // Zero if not including mem table - db_->GetApproximateSizes(&r, 1, &size, false); - ASSERT_EQ(size, 0); + // Does not exist, and create_if_missing == true: OK + options.create_if_missing = true; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); - start = Key(500); - end = Key(600); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); + delete db; + db = nullptr; - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); - } + // Does exist, and error_if_exists == true: error + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); - start = Key(500); - end = Key(600); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); + // Does exist, and error_if_exists == false: OK + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); - start = Key(100); - end = Key(1020); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_GT(size, 6000); + delete db; + db = nullptr; +} - options.max_write_buffer_number = 8; - options.min_write_buffer_number_to_merge = 5; - options.write_buffer_size = 1024 * N; // Not very large +TEST_F(DBTest, DBOpen_Change_NumLevels) { + Options options = CurrentOptions(); + options.create_if_missing = true; DestroyAndReopen(options); + ASSERT_TRUE(db_ != nullptr); + CreateAndReopenWithCF({"pikachu"}, options); - int keys[N * 3]; - for (int i = 0; i < N; i++) { - keys[i * 3] = i * 5; - keys[i * 
3 + 1] = i * 5 + 1; - keys[i * 3 + 2] = i * 5 + 2; - } - std::random_shuffle(std::begin(keys), std::end(keys)); - - for (int i = 0; i < N * 3; i++) { - ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024))); - } - - start = Key(100); - end = Key(300); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); + ASSERT_OK(Put(1, "a", "123")); + ASSERT_OK(Put(1, "b", "234")); + Flush(1); + MoveFilesToLevel(3, 1); + Close(); - start = Key(1050); - end = Key(1080); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_GT(size, 6000); + options.create_if_missing = false; + options.num_levels = 2; + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); + ASSERT_TRUE(db_ == nullptr); +} - start = Key(2100); - end = Key(2300); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); +TEST_F(DBTest, DestroyDBMetaDatabase) { + std::string dbname = test::TmpDir(env_) + "/db_meta"; + ASSERT_OK(env_->CreateDirIfMissing(dbname)); + std::string metadbname = MetaDatabaseName(dbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metadbname)); + std::string metametadbname = MetaDatabaseName(metadbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metametadbname)); - start = Key(1050); - end = Key(1080); - r = Range(start, end); - uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(&r, 1, &size_with_mt, true); - ASSERT_GT(size_with_mt, 6000); - db_->GetApproximateSizes(&r, 1, &size_without_mt, false); - ASSERT_EQ(size_without_mt, 0); + // Destroy previous versions if they exist. Using the long way. 
+ Options options = CurrentOptions(); + ASSERT_OK(DestroyDB(metametadbname, options)); + ASSERT_OK(DestroyDB(metadbname, options)); + ASSERT_OK(DestroyDB(dbname, options)); - Flush(); + // Setup databases + DB* db = nullptr; + ASSERT_OK(DB::Open(options, dbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(options, metadbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(options, metametadbname, &db)); + delete db; + db = nullptr; - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024))); - } + // Delete databases + ASSERT_OK(DestroyDB(dbname, options)); - start = Key(1050); - end = Key(1080); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size_with_mt, true); - db_->GetApproximateSizes(&r, 1, &size_without_mt, false); - ASSERT_GT(size_with_mt, size_without_mt); - ASSERT_GT(size_without_mt, 6000); + // Check if deletion worked. + options.create_if_missing = false; + ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); } -TEST_F(DBTest, ApproximateSizes) { +#ifndef ROCKSDB_LITE +// Check that number of files does not grow when writes are dropped +TEST_F(DBTest, DropWrites) { do { Options options = CurrentOptions(); - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - options.create_if_missing = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + options.env = env_; + options.paranoid_checks = false; + Reopen(options); - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - const int N = 80; - static const int S1 = 100000; - static const int S2 = 105000; // Allow some expansion from metadata - Random rnd(301); 
- for (int i = 0; i < N; i++) { - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1))); + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + const size_t num_files = CountFiles(); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); + env_->sleep_counter_.Reset(); + env_->no_sleep_ = true; + for (int i = 0; i < 5; i++) { + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + for (int level = 0; level < dbfull()->NumberLevels(); level++) { + if (level > 0 && level == dbfull()->NumberLevels() - 1) { + break; + } + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + } + } else { + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } } - // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); - ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), - S2 * (i + 1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); - } - ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); - ASSERT_TRUE( - Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + std::string property_value; + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("5", property_value); - std::string cstart_str = Key(compact_start); - std::string cend_str = Key(compact_start + 9); - Slice cstart = cstart_str; - Slice cend = cend_str; - dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); - } + env_->drop_writes_.store(false, 
std::memory_order_release); + ASSERT_LT(CountFiles(), num_files + 3); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); - } - // ApproximateOffsetOf() is not yet implemented in plain table format. - } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | - kSkipPlainTable | kSkipHashIndex)); + // Check that compaction attempts slept after errors + // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler + // versions + ASSERT_GE(env_->sleep_counter_.Read(), 4); + } while (ChangeCompactOptions()); } -TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { +// Check background error counter bumped on flush failures. +TEST_F(DBTest, DropWritesFlush) { do { Options options = CurrentOptions(); - options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, options); + options.env = env_; + options.max_background_flushes = 1; + Reopen(options); - Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(2), big1)); - ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(4), big1)); - ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); + ASSERT_OK(Put("foo", "v1")); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, options); + std::string property_value; + // Background error count is 0 now. 
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("0", property_value); - ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); + dbfull()->TEST_FlushMemTable(true); - ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("1", property_value); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - } - // ApproximateOffsetOf() is not yet implemented in plain table format. - } while (ChangeOptions(kSkipPlainTable)); + env_->drop_writes_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); } #endif // ROCKSDB_LITE -TEST_F(DBTest, IteratorPinsRef) { +// Check that CompactRange() returns failure if there is not enough space left +// on device +TEST_F(DBTest, NoSpaceCompactRange) { do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "hello"); - - // Get iterator that will yield the current contents of the DB. 
- Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); - // Write to force compactions - Put(1, "foo", "newvalue1"); - for (int i = 0; i < 100; i++) { - // 100K values - ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); + // generate 5 tables + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i), Key(i) + "v")); + ASSERT_OK(Flush()); } - Put(1, "foo", "newvalue2"); - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("hello", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - delete iter; + // Force out-of-space errors + env_->no_space_.store(true, std::memory_order_release); + + Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.IsIOError()); + + env_->no_space_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_LITE -TEST_F(DBTest, Snapshot) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; +TEST_F(DBTest, NonWritableFileSystem) { do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - Put(0, "foo", "0v1"); - Put(1, "foo", "1v1"); + Options options = CurrentOptions(); + options.write_buffer_size = 4096; + options.arena_block_size = 4096; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "v1")); + env_->non_writeable_rate_.store(100); + std::string big(100000, 'x'); + int errors = 0; + for (int i = 0; i < 20; i++) { + if (!Put("foo", big).ok()) { + errors++; + env_->SleepForMicroseconds(100000); + } + } + ASSERT_GT(errors, 0); + env_->non_writeable_rate_.store(0); + } while (ChangeCompactOptions()); +} - const Snapshot* s1 = db_->GetSnapshot(); - ASSERT_EQ(1U, GetNumSnapshots()); - uint64_t time_snap1 = GetTimeOldestSnapshots(); - 
ASSERT_GT(time_snap1, 0U); - Put(0, "foo", "0v2"); - Put(1, "foo", "1v2"); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, ManifestWriteError) { + // Test for the following problem: + // (a) Compaction produces file F + // (b) Log record containing F is written to MANIFEST file, but Sync() fails + // (c) GC deletes F + // (d) After reopening DB, reads fail since deleted F is named in log record - env_->addon_time_.fetch_add(1); + // We iterate twice. In the second iteration, everything is the + // same except the log record never makes it to the MANIFEST file. + for (int iter = 0; iter < 2; iter++) { + std::atomic* error_type = (iter == 0) ? &env_->manifest_sync_error_ + : &env_->manifest_write_error_; - const Snapshot* s2 = db_->GetSnapshot(); - ASSERT_EQ(2U, GetNumSnapshots()); - ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); - Put(0, "foo", "0v3"); - Put(1, "foo", "1v3"); + // Insert foo=>bar mapping + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); - { - ManagedSnapshot s3(db_); - ASSERT_EQ(3U, GetNumSnapshots()); - ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + // Memtable compaction (will succeed) + Flush(); + ASSERT_EQ("bar", Get("foo")); + const int last = 2; + MoveFilesToLevel(2); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level - Put(0, "foo", "0v4"); - Put(1, "foo", "1v4"); - ASSERT_EQ("0v1", Get(0, "foo", s1)); - ASSERT_EQ("1v1", Get(1, "foo", s1)); - ASSERT_EQ("0v2", Get(0, "foo", s2)); - ASSERT_EQ("1v2", Get(1, "foo", s2)); - ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot())); - ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot())); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); - } - - ASSERT_EQ(2U, GetNumSnapshots()); - ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); - ASSERT_EQ("0v1", Get(0, "foo", s1)); - 
ASSERT_EQ("1v1", Get(1, "foo", s1)); - ASSERT_EQ("0v2", Get(0, "foo", s2)); - ASSERT_EQ("1v2", Get(1, "foo", s2)); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); - db_->ReleaseSnapshot(s1); - ASSERT_EQ("0v2", Get(0, "foo", s2)); - ASSERT_EQ("1v2", Get(1, "foo", s2)); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); - ASSERT_EQ(1U, GetNumSnapshots()); - ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); + error_type->store(false, std::memory_order_release); - db_->ReleaseSnapshot(s2); - ASSERT_EQ(0U, GetNumSnapshots()); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); - } while (ChangeOptions(kSkipHashCuckoo)); -} + // Since paranoid_checks=true, writes should fail + ASSERT_NOK(Put("foo2", "bar2")); -TEST_F(DBTest, HiddenValuesAreRemoved) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - CreateAndReopenWithCF({"pikachu"}, options); - Random rnd(301); - FillLevels("a", "z", 1); + // Recovery: should not lose data + ASSERT_EQ("bar", Get("foo")); - std::string big = RandomString(&rnd, 50000); - Put(1, "foo", big); - Put(1, "pastfoo", "v"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "tiny"); - Put(1, "pastfoo2", "v2"); // Advance sequence number one more + // Try again with paranoid_checks=false + Close(); + options.paranoid_checks = false; + Reopen(options); - ASSERT_OK(Flush(1)); - ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); - ASSERT_EQ(big, Get(1, "foo", snapshot)); - ASSERT_TRUE(Between(Size("", 
"pastfoo", 1), 50000, 60000)); - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); - Slice x("x"); - dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); - dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + // Recovery: should not lose data + error_type->store(false, std::memory_order_release); + Reopen(options); + ASSERT_EQ("bar", Get("foo")); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); - // ApproximateOffsetOf() is not yet implemented in plain table format, - // which is used by Size(). - // skip HashCuckooRep as it does not support snapshot - } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | - kSkipPlainTable | kSkipHashCuckoo)); + // Since paranoid_checks=false, writes should succeed + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ("bar2", Get("foo2")); + } } #endif // ROCKSDB_LITE -TEST_F(DBTest, CompactBetweenSnapshots) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, options); - Random rnd(301); - FillLevels("a", "z", 1); - - Put(1, "foo", "first"); - const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "second"); - Put(1, "foo", "third"); - Put(1, "foo", "fourth"); - const Snapshot* snapshot2 = db_->GetSnapshot(); - Put(1, "foo", "fifth"); - Put(1, "foo", "sixth"); - - // All entries (including duplicates) exist - // before any compaction or flush is triggered. 
- ASSERT_EQ(AllEntriesFor("foo", 1), - "[ sixth, fifth, fourth, third, second, first ]"); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ("first", Get(1, "foo", snapshot1)); +TEST_F(DBTest, PutFailsParanoid) { + // Test the following: + // (a) A random put fails in paranoid mode (simulate by sync fail) + // (b) All other puts have to fail, even if writes would succeed + // (c) All of that should happen ONLY if paranoid_checks = true - // After a flush, "second", "third" and "fifth" should - // be removed - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Status s; - // after we release the snapshot1, only two values left - db_->ReleaseSnapshot(snapshot1); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + s = Put(1, "foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.store(false, std::memory_order_release); + s = Put(1, "foo3", "bar3"); + // the next put should fail, too + ASSERT_TRUE(!s.ok()); + // but we're still able to read + ASSERT_EQ("bar", Get(1, "foo")); - // We have only one valid snapshot snapshot2. Since snapshot1 is - // not valid anymore, "first" should be removed by a compaction. 
- ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); + // do the same thing with paranoid checks off + options.paranoid_checks = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); - // after we release the snapshot2, only one value should be left - db_->ReleaseSnapshot(snapshot2); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); - // skip HashCuckooRep as it does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction)); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + s = Put(1, "foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.store(false, std::memory_order_release); + s = Put(1, "foo3", "bar3"); + // the next put should NOT fail + ASSERT_TRUE(s.ok()); } -TEST_F(DBTest, UnremovableSingleDelete) { - // If we compact: - // - // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2) - // - // We do not want to end up with: - // - // Put(A, v1) Snapshot Put(A, v2) - // - // Because a subsequent SingleDelete(A) would delete the Put(A, v2) - // but not Put(A, v1), so Get(A) would return v1. 
- anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; +#ifndef ROCKSDB_LITE +TEST_F(DBTest, SnapshotFiles) { do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "first"); - const Snapshot* snapshot = db_->GetSnapshot(); - SingleDelete(1, "foo"); - Put(1, "foo", "second"); - ASSERT_OK(Flush(1)); + Random rnd(301); - ASSERT_EQ("first", Get(1, "foo", snapshot)); - ASSERT_EQ("second", Get(1, "foo")); + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put((i < 40), Key(i), values[i])); + } - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); + // assert that nothing makes it to disk yet. + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - SingleDelete(1, "foo"); + // get a file snapshot + uint64_t manifest_number = 0; + uint64_t manifest_size = 0; + std::vector files; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(files, &manifest_size); - ASSERT_EQ("first", Get(1, "foo", snapshot)); - ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + // CURRENT, MANIFEST, *.sst files (one for each CF) + ASSERT_EQ(files.size(), 4U); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + uint64_t number = 0; + FileType type; - ASSERT_EQ("first", Get(1, "foo", snapshot)); - ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - db_->ReleaseSnapshot(snapshot); - // Skip HashCuckooRep as it does not support single delete. FIFO and - // universal compaction do not apply to the test case. Skip MergePut - // because single delete does not get removed when it encounters a merge. 
- } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | - kSkipUniversalCompaction | kSkipMergePut)); -} + // copy these files to a new snapshot directory + std::string snapdir = dbname_ + ".snapdir/"; + ASSERT_OK(env_->CreateDirIfMissing(snapdir)); -#ifndef ROCKSDB_LITE -TEST_F(DBTest, DeletionMarkers1) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); - ASSERT_OK(Flush(1)); - const int last = 2; - MoveFilesToLevel(last, 1); - // foo => v1 is now in last level - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + for (size_t i = 0; i < files.size(); i++) { + // our clients require that GetLiveFiles returns + // files with "/" as first character! + ASSERT_EQ(files[i][0], '/'); + std::string src = dbname_ + files[i]; + std::string dest = snapdir + files[i]; - // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); - MoveFilesToLevel(last - 1, 1); - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - - Delete(1, "foo"); - Put(1, "foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); - ASSERT_OK(Flush(1)); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - Slice z("z"); - dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); - // DEL eliminated, but v1 remains because we aren't compacting that level - // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); - // Merging last-1 w/ last, so we are the base level for "foo", so - // DEL is removed. (as is v1). 
- ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); -} - -TEST_F(DBTest, DeletionMarkers2) { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); - ASSERT_OK(Flush(1)); - const int last = 2; - MoveFilesToLevel(last, 1); - // foo => v1 is now in last level - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - - // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); - MoveFilesToLevel(last - 1, 1); - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - - Delete(1, "foo"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - ASSERT_OK(Flush(1)); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); - // DEL kept: "last" file overlaps - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); - // Merging last-1 w/ last, so we are the base level for "foo", so - // DEL is removed. (as is v1). - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); -} - -TEST_F(DBTest, OverlapInLevel0) { - do { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - - //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. - ASSERT_OK(Put(1, "100", "v100")); - ASSERT_OK(Put(1, "999", "v999")); - Flush(1); - MoveFilesToLevel(2, 1); - ASSERT_OK(Delete(1, "100")); - ASSERT_OK(Delete(1, "999")); - Flush(1); - MoveFilesToLevel(1, 1); - ASSERT_EQ("0,1,1", FilesPerLevel(1)); - - // Make files spanning the following ranges in level-0: - // files[0] 200 .. 900 - // files[1] 300 .. 500 - // Note that files are sorted by smallest key. 
- ASSERT_OK(Put(1, "300", "v300")); - ASSERT_OK(Put(1, "500", "v500")); - Flush(1); - ASSERT_OK(Put(1, "200", "v200")); - ASSERT_OK(Put(1, "600", "v600")); - ASSERT_OK(Put(1, "900", "v900")); - Flush(1); - ASSERT_EQ("2,1,1", FilesPerLevel(1)); - - // Compact away the placeholder files we created initially - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); - ASSERT_EQ("2", FilesPerLevel(1)); - - // Do a memtable compaction. Before bug-fix, the compaction would - // not detect the overlap with level-0 files and would incorrectly place - // the deletion in a deeper level. - ASSERT_OK(Delete(1, "600")); - Flush(1); - ASSERT_EQ("3", FilesPerLevel(1)); - ASSERT_EQ("NOT_FOUND", Get(1, "600")); - } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); -} -#endif // ROCKSDB_LITE - -TEST_F(DBTest, ComparatorCheck) { - class NewComparator : public Comparator { - public: - virtual const char* Name() const override { - return "rocksdb.NewComparator"; - } - virtual int Compare(const Slice& a, const Slice& b) const override { - return BytewiseComparator()->Compare(a, b); - } - virtual void FindShortestSeparator(std::string* s, - const Slice& l) const override { - BytewiseComparator()->FindShortestSeparator(s, l); - } - virtual void FindShortSuccessor(std::string* key) const override { - BytewiseComparator()->FindShortSuccessor(key); - } - }; - Options new_options, options; - NewComparator cmp; - do { - options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - new_options = CurrentOptions(); - new_options.comparator = &cmp; - // only the non-default column family has non-matching comparator - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, - std::vector({options, new_options})); - ASSERT_TRUE(!s.ok()); - ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) - << s.ToString(); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, 
CustomComparator) { - class NumberComparator : public Comparator { - public: - virtual const char* Name() const override { - return "test.NumberComparator"; - } - virtual int Compare(const Slice& a, const Slice& b) const override { - return ToNumber(a) - ToNumber(b); - } - virtual void FindShortestSeparator(std::string* s, - const Slice& l) const override { - ToNumber(*s); // Check format - ToNumber(l); // Check format - } - virtual void FindShortSuccessor(std::string* key) const override { - ToNumber(*key); // Check format - } - private: - static int ToNumber(const Slice& x) { - // Check that there are no extra characters. - EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']') - << EscapeString(x); - int val; - char ignored; - EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) - << EscapeString(x); - return val; - } - }; - Options new_options; - NumberComparator cmp; - do { - new_options = CurrentOptions(); - new_options.create_if_missing = true; - new_options.comparator = &cmp; - new_options.write_buffer_size = 4096; // Compact more often - new_options.arena_block_size = 4096; - new_options = CurrentOptions(new_options); - DestroyAndReopen(new_options); - CreateAndReopenWithCF({"pikachu"}, new_options); - ASSERT_OK(Put(1, "[10]", "ten")); - ASSERT_OK(Put(1, "[0x14]", "twenty")); - for (int i = 0; i < 2; i++) { - ASSERT_EQ("ten", Get(1, "[10]")); - ASSERT_EQ("ten", Get(1, "[0xa]")); - ASSERT_EQ("twenty", Get(1, "[20]")); - ASSERT_EQ("twenty", Get(1, "[0x14]")); - ASSERT_EQ("NOT_FOUND", Get(1, "[15]")); - ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]")); - Compact(1, "[0]", "[9999]"); - } - - for (int run = 0; run < 2; run++) { - for (int i = 0; i < 1000; i++) { - char buf[100]; - snprintf(buf, sizeof(buf), "[%d]", i*10); - ASSERT_OK(Put(1, buf, buf)); - } - Compact(1, "[0]", "[1000000]"); - } - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, DBOpen_Options) { - Options options = CurrentOptions(); - std::string dbname = 
test::TmpDir(env_) + "/db_options_test"; - ASSERT_OK(DestroyDB(dbname, options)); - - // Does not exist, and create_if_missing == false: error - DB* db = nullptr; - options.create_if_missing = false; - Status s = DB::Open(options, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); - ASSERT_TRUE(db == nullptr); - - // Does not exist, and create_if_missing == true: OK - options.create_if_missing = true; - s = DB::Open(options, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != nullptr); - - delete db; - db = nullptr; - - // Does exist, and error_if_exists == true: error - options.create_if_missing = false; - options.error_if_exists = true; - s = DB::Open(options, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); - ASSERT_TRUE(db == nullptr); - - // Does exist, and error_if_exists == false: OK - options.create_if_missing = true; - options.error_if_exists = false; - s = DB::Open(options, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != nullptr); - - delete db; - db = nullptr; -} - -TEST_F(DBTest, DBOpen_Change_NumLevels) { - Options options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); - ASSERT_TRUE(db_ != nullptr); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "a", "123")); - ASSERT_OK(Put(1, "b", "234")); - Flush(1); - MoveFilesToLevel(3, 1); - Close(); - - options.create_if_missing = false; - options.num_levels = 2; - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); - ASSERT_TRUE(db_ == nullptr); -} - -TEST_F(DBTest, DestroyDBMetaDatabase) { - std::string dbname = test::TmpDir(env_) + "/db_meta"; - ASSERT_OK(env_->CreateDirIfMissing(dbname)); - std::string metadbname = MetaDatabaseName(dbname, 0); - ASSERT_OK(env_->CreateDirIfMissing(metadbname)); - std::string metametadbname = MetaDatabaseName(metadbname, 0); - 
ASSERT_OK(env_->CreateDirIfMissing(metametadbname)); - - // Destroy previous versions if they exist. Using the long way. - Options options = CurrentOptions(); - ASSERT_OK(DestroyDB(metametadbname, options)); - ASSERT_OK(DestroyDB(metadbname, options)); - ASSERT_OK(DestroyDB(dbname, options)); - - // Setup databases - DB* db = nullptr; - ASSERT_OK(DB::Open(options, dbname, &db)); - delete db; - db = nullptr; - ASSERT_OK(DB::Open(options, metadbname, &db)); - delete db; - db = nullptr; - ASSERT_OK(DB::Open(options, metametadbname, &db)); - delete db; - db = nullptr; - - // Delete databases - ASSERT_OK(DestroyDB(dbname, options)); - - // Check if deletion worked. - options.create_if_missing = false; - ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); -} - -#ifndef ROCKSDB_LITE -// Check that number of files does not grow when writes are dropped -TEST_F(DBTest, DropWrites) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.paranoid_checks = false; - Reopen(options); - - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - Compact("a", "z"); - const size_t num_files = CountFiles(); - // Force out-of-space errors - env_->drop_writes_.store(true, std::memory_order_release); - env_->sleep_counter_.Reset(); - env_->no_sleep_ = true; - for (int i = 0; i < 5; i++) { - if (option_config_ != kUniversalCompactionMultiLevel && - option_config_ != kUniversalSubcompactions) { - for (int level = 0; level < dbfull()->NumberLevels(); level++) { - if (level > 0 && level == dbfull()->NumberLevels() - 1) { - break; - } - dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, - true /* disallow trivial move */); - } - } else { - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - } - } - - std::string property_value; - ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); - 
ASSERT_EQ("5", property_value); - - env_->drop_writes_.store(false, std::memory_order_release); - ASSERT_LT(CountFiles(), num_files + 3); - - // Check that compaction attempts slept after errors - // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler - // versions - ASSERT_GE(env_->sleep_counter_.Read(), 4); - } while (ChangeCompactOptions()); -} - -// Check background error counter bumped on flush failures. -TEST_F(DBTest, DropWritesFlush) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.max_background_flushes = 1; - Reopen(options); - - ASSERT_OK(Put("foo", "v1")); - // Force out-of-space errors - env_->drop_writes_.store(true, std::memory_order_release); - - std::string property_value; - // Background error count is 0 now. - ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); - ASSERT_EQ("0", property_value); - - dbfull()->TEST_FlushMemTable(true); - - ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); - ASSERT_EQ("1", property_value); - - env_->drop_writes_.store(false, std::memory_order_release); - } while (ChangeCompactOptions()); -} -#endif // ROCKSDB_LITE - -// Check that CompactRange() returns failure if there is not enough space left -// on device -TEST_F(DBTest, NoSpaceCompactRange) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.disable_auto_compactions = true; - Reopen(options); - - // generate 5 tables - for (int i = 0; i < 5; ++i) { - ASSERT_OK(Put(Key(i), Key(i) + "v")); - ASSERT_OK(Flush()); - } - - // Force out-of-space errors - env_->no_space_.store(true, std::memory_order_release); - - Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow trivial move */); - ASSERT_TRUE(s.IsIOError()); - - env_->no_space_.store(false, std::memory_order_release); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, NonWritableFileSystem) { - do { - Options options = CurrentOptions(); - 
options.write_buffer_size = 4096; - options.arena_block_size = 4096; - options.env = env_; - Reopen(options); - ASSERT_OK(Put("foo", "v1")); - env_->non_writeable_rate_.store(100); - std::string big(100000, 'x'); - int errors = 0; - for (int i = 0; i < 20; i++) { - if (!Put("foo", big).ok()) { - errors++; - env_->SleepForMicroseconds(100000); - } - } - ASSERT_GT(errors, 0); - env_->non_writeable_rate_.store(0); - } while (ChangeCompactOptions()); -} - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, ManifestWriteError) { - // Test for the following problem: - // (a) Compaction produces file F - // (b) Log record containing F is written to MANIFEST file, but Sync() fails - // (c) GC deletes F - // (d) After reopening DB, reads fail since deleted F is named in log record - - // We iterate twice. In the second iteration, everything is the - // same except the log record never makes it to the MANIFEST file. - for (int iter = 0; iter < 2; iter++) { - std::atomic* error_type = (iter == 0) - ? &env_->manifest_sync_error_ - : &env_->manifest_write_error_; - - // Insert foo=>bar mapping - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - options.error_if_exists = false; - options.paranoid_checks = true; - DestroyAndReopen(options); - ASSERT_OK(Put("foo", "bar")); - ASSERT_EQ("bar", Get("foo")); - - // Memtable compaction (will succeed) - Flush(); - ASSERT_EQ("bar", Get("foo")); - const int last = 2; - MoveFilesToLevel(2); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level - - // Merging compaction (will fail) - error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail - ASSERT_EQ("bar", Get("foo")); - - error_type->store(false, std::memory_order_release); - - // Since paranoid_checks=true, writes should fail - ASSERT_NOK(Put("foo2", "bar2")); - - // Recovery: should not lose data - ASSERT_EQ("bar", Get("foo")); - - // Try again with 
paranoid_checks=false - Close(); - options.paranoid_checks = false; - Reopen(options); - - // Merging compaction (will fail) - error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail - ASSERT_EQ("bar", Get("foo")); - - // Recovery: should not lose data - error_type->store(false, std::memory_order_release); - Reopen(options); - ASSERT_EQ("bar", Get("foo")); - - // Since paranoid_checks=false, writes should succeed - ASSERT_OK(Put("foo2", "bar2")); - ASSERT_EQ("bar", Get("foo")); - ASSERT_EQ("bar2", Get("foo2")); - } -} -#endif // ROCKSDB_LITE - -TEST_F(DBTest, PutFailsParanoid) { - // Test the following: - // (a) A random put fails in paranoid mode (simulate by sync fail) - // (b) All other puts have to fail, even if writes would succeed - // (c) All of that should happen ONLY if paranoid_checks = true - - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - options.error_if_exists = false; - options.paranoid_checks = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - Status s; - - ASSERT_OK(Put(1, "foo", "bar")); - ASSERT_OK(Put(1, "foo1", "bar1")); - // simulate error - env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); - env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); - // the next put should fail, too - ASSERT_TRUE(!s.ok()); - // but we're still able to read - ASSERT_EQ("bar", Get(1, "foo")); - - // do the same thing with paranoid checks off - options.paranoid_checks = false; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "foo", "bar")); - ASSERT_OK(Put(1, "foo1", "bar1")); - // simulate error - env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); - env_->log_write_error_.store(false, 
std::memory_order_release); - s = Put(1, "foo3", "bar3"); - // the next put should NOT fail - ASSERT_TRUE(s.ok()); -} - -TEST_F(DBTest, BloomFilter) { - do { - Options options = CurrentOptions(); - env_->count_random_reads_ = true; - options.env = env_; - // ChangeCompactOptions() only changes compaction style, which does not - // trigger reset of table_factory - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - CreateAndReopenWithCF({"pikachu"}, options); - - // Populate multiple layers - const int N = 10000; - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - Compact(1, "a", "z"); - for (int i = 0; i < N; i += 100) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - Flush(1); - - // Prevent auto compactions triggered by seeks - env_->delay_sstable_sync_.store(true, std::memory_order_release); - - // Lookup present keys. Should rarely read from small sstable. - env_->random_read_counter_.Reset(); - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - int reads = env_->random_read_counter_.Read(); - fprintf(stderr, "%d present => %d reads\n", N, reads); - ASSERT_GE(reads, N); - ASSERT_LE(reads, N + 2*N/100); - - // Lookup present keys. Should rarely read from either sstable. 
- env_->random_read_counter_.Reset(); - for (int i = 0; i < N; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); - } - reads = env_->random_read_counter_.Read(); - fprintf(stderr, "%d missing => %d reads\n", N, reads); - ASSERT_LE(reads, 3*N/100); - - env_->delay_sstable_sync_.store(false, std::memory_order_release); - Close(); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, BloomFilterRate) { - while (ChangeFilterOptions()) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - // Add a large key to make the file contain wide range - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - - // Check if filter is useful - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); - } - ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); - } -} - -TEST_F(DBTest, BloomFilterCompatibility) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // Create with block based filter - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check db with full filter - table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - 
ReopenWithColumnFamilies({"default", "pikachu"}, options); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); -} - -TEST_F(DBTest, BloomFilterReverseCompatibility) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // Create with full filter - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check db with block_based filter - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); -} - -namespace { -// A wrapped bloom over default FilterPolicy -class WrappedBloom : public FilterPolicy { - public: - explicit WrappedBloom(int bits_per_key) : - filter_(NewBloomFilterPolicy(bits_per_key)), - counter_(0) {} - - ~WrappedBloom() { delete filter_; } - - const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } - - void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst) - const override { - std::unique_ptr user_keys(new rocksdb::Slice[n]); - for (int i = 0; i < n; ++i) { - user_keys[i] = convertKey(keys[i]); - } - return filter_->CreateFilter(user_keys.get(), n, dst); - } - - bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter) - const override { - counter_++; - return 
filter_->KeyMayMatch(convertKey(key), filter); - } - - uint32_t GetCounter() { return counter_; } - - private: - const FilterPolicy* filter_; - mutable uint32_t counter_; - - rocksdb::Slice convertKey(const rocksdb::Slice& key) const { - return key; - } -}; -} // namespace - -TEST_F(DBTest, BloomFilterWrapper) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - - BlockBasedTableOptions table_options; - WrappedBloom* policy = new WrappedBloom(10); - table_options.filter_policy.reset(policy); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - // Add a large key to make the file contain wide range - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - ASSERT_EQ(0U, policy->GetCounter()); - Flush(1); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ(1U * maxKey, policy->GetCounter()); - - // Check if filter is useful - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); - } - ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); - ASSERT_EQ(2U * maxKey, policy->GetCounter()); -} - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, SnapshotFiles) { - do { - Options options = CurrentOptions(); - options.write_buffer_size = 100000000; // Large write buffer - CreateAndReopenWithCF({"pikachu"}, options); - - Random rnd(301); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - std::vector values; - for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put((i < 40), Key(i), values[i])); - } - - // assert that nothing makes it to disk yet. 
- ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - - // get a file snapshot - uint64_t manifest_number = 0; - uint64_t manifest_size = 0; - std::vector files; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(files, &manifest_size); - - // CURRENT, MANIFEST, *.sst files (one for each CF) - ASSERT_EQ(files.size(), 4U); - - uint64_t number = 0; - FileType type; - - // copy these files to a new snapshot directory - std::string snapdir = dbname_ + ".snapdir/"; - ASSERT_OK(env_->CreateDirIfMissing(snapdir)); - - for (size_t i = 0; i < files.size(); i++) { - // our clients require that GetLiveFiles returns - // files with "/" as first character! - ASSERT_EQ(files[i][0], '/'); - std::string src = dbname_ + files[i]; - std::string dest = snapdir + files[i]; - - uint64_t size; - ASSERT_OK(env_->GetFileSize(src, &size)); - - // record the number and the size of the - // latest manifest file - if (ParseFileName(files[i].substr(1), &number, &type)) { - if (type == kDescriptorFile) { - if (number > manifest_number) { - manifest_number = number; - ASSERT_GE(size, manifest_size); - size = manifest_size; // copy only valid MANIFEST data - } - } - } - CopyFile(src, dest, size); - } - - // release file snapshot - dbfull()->DisableFileDeletions(); - // overwrite one key, this key should not appear in the snapshot - std::vector extras; - for (unsigned int i = 0; i < 1; i++) { - extras.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(0, Key(i), extras[i])); - } - - // verify that data in the snapshot are correct - std::vector column_families; - column_families.emplace_back("default", ColumnFamilyOptions()); - column_families.emplace_back("pikachu", ColumnFamilyOptions()); - std::vector cf_handles; - DB* snapdb; - DBOptions opts; - opts.env = env_; - opts.create_if_missing = false; - Status stat = - DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); - ASSERT_OK(stat); - - ReadOptions roptions; - std::string val; - for (unsigned int i = 0; i < 80; i++) { - stat = 
snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); - ASSERT_EQ(values[i].compare(val), 0); - } - for (auto cfh : cf_handles) { - delete cfh; - } - delete snapdb; - - // look at the new live files after we added an 'extra' key - // and after we took the first snapshot. - uint64_t new_manifest_number = 0; - uint64_t new_manifest_size = 0; - std::vector newfiles; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(newfiles, &new_manifest_size); - - // find the new manifest file. assert that this manifest file is - // the same one as in the previous snapshot. But its size should be - // larger because we added an extra key after taking the - // previous shapshot. - for (size_t i = 0; i < newfiles.size(); i++) { - std::string src = dbname_ + "/" + newfiles[i]; - // record the lognumber and the size of the - // latest manifest file - if (ParseFileName(newfiles[i].substr(1), &number, &type)) { - if (type == kDescriptorFile) { - if (number > new_manifest_number) { - uint64_t size; - new_manifest_number = number; - ASSERT_OK(env_->GetFileSize(src, &size)); - ASSERT_GE(size, new_manifest_size); - } - } - } - } - ASSERT_EQ(manifest_number, new_manifest_number); - ASSERT_GT(new_manifest_size, manifest_size); - - // release file snapshot - dbfull()->DisableFileDeletions(); - } while (ChangeCompactOptions()); -} -#endif - -TEST_F(DBTest, CompactOnFlush) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, options); - - Put(1, "foo", "v1"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); - - // Write two new keys - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); - - // Case1: Delete followed by a put - Delete(1, "foo"); - Put(1, "foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); - - // After the current memtable is flushed, the DEL 
should - // have been removed - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); - - // Case 2: Delete followed by another delete - Delete(1, "foo"); - Delete(1, "foo"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - - // Case 3: Put followed by a delete - Put(1, "foo", "v3"); - Delete(1, "foo"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - - // Case 4: Put followed by another Put - Put(1, "foo", "v4"); - Put(1, "foo", "v5"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - - // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - - // Case 5: Put followed by snapshot followed by another Put - // Both puts should remain. 
- Put(1, "foo", "v6"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "v7"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); - db_->ReleaseSnapshot(snapshot); - - // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - - // Case 5: snapshot followed by a put followed by another Put - // Only the last put should remain. - const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "v8"); - Put(1, "foo", "v9"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); - db_->ReleaseSnapshot(snapshot1); - } while (ChangeCompactOptions()); -} - -namespace { -std::vector ListSpecificFiles( - Env* env, const std::string& path, const FileType expected_file_type) { - std::vector files; - std::vector file_numbers; - env->GetChildren(path, &files); - uint64_t number; - FileType type; - for (size_t i = 0; i < files.size(); ++i) { - if (ParseFileName(files[i], &number, &type)) { - if (type == expected_file_type) { - file_numbers.push_back(number); - } - } - } - return file_numbers; -} - -std::vector ListTableFiles(Env* env, const std::string& path) { - return ListSpecificFiles(env, path, kTableFile); -} -} // namespace - -TEST_F(DBTest, FlushOneColumnFamily) { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", - "alyosha", "popovich"}, - options); - - ASSERT_OK(Put(0, "Default", "Default")); - ASSERT_OK(Put(1, "pikachu", "pikachu")); - ASSERT_OK(Put(2, "ilya", "ilya")); - ASSERT_OK(Put(3, "muromec", "muromec")); - ASSERT_OK(Put(4, "dobrynia", "dobrynia")); - ASSERT_OK(Put(5, "nikitich", "nikitich")); - ASSERT_OK(Put(6, "alyosha", "alyosha")); - ASSERT_OK(Put(7, "popovich", "popovich")); - - for (int i = 0; i < 8; ++i) { - Flush(i); - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), i + 1U); - } -} - -#ifndef 
ROCKSDB_LITE -// In https://reviews.facebook.net/D20661 we change -// recovery behavior: previously for each log file each column family -// memtable was flushed, even it was empty. Now it's changed: -// we try to create the smallest number of table files by merging -// updates from multiple logs -TEST_F(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { - Options options = CurrentOptions(); - options.write_buffer_size = 5000000; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - - // Since we will reopen DB with smaller write_buffer_size, - // each key will go to new SST file - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - - ASSERT_OK(Put(3, Key(10), DummyString(1))); - // Make 'dobrynia' to be flushed and new WAL file to be created - ASSERT_OK(Put(2, Key(10), DummyString(7500000))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(1)); - // Make sure 'dobrynia' was flushed: check sst files amount - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - } - // New WAL file - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - - options.write_buffer_size = 4096; - options.arena_block_size = 4096; - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - options); - { - // No inserts => default is empty - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - 
static_cast(5)); - // 1 SST for big key + 1 SST for small one - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(2)); - // 1 SST for all keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(1)); - } -} - -// In https://reviews.facebook.net/D20661 we change -// recovery behavior: previously for each log file each column family -// memtable was flushed, even it wasn't empty. Now it's changed: -// we try to create the smallest number of table files by merging -// updates from multiple logs -TEST_F(DBTest, RecoverCheckFileAmount) { - Options options = CurrentOptions(); - options.write_buffer_size = 100000; - options.arena_block_size = 4 * 1024; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - // Make 'nikitich' memtable to be flushed - ASSERT_OK(Put(3, Key(10), DummyString(1002400))); - ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // 4 memtable are not flushed, 1 sst file - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(1)); - } - // Memtable for 'nikitich' has flushed, new WAL file has opened - // 4 memtable still not flushed - - // Write to new WAL file - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - // Fill up 'nikitich' one more time - ASSERT_OK(Put(3, Key(10), DummyString(1002400))); - // make it flush - ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // There are still 4 memtable not flushed, and 2 sst tables - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), 
DummyString(1))); - - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(2)); - } - - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - options); - { - std::vector table_files = ListTableFiles(env_, dbname_); - // Check, that records for 'default', 'dobrynia' and 'pikachu' from - // first, second and third WALs went to the same SST. - // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for - // 'dobrynia', one for 'pikachu' - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(3)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); - } -} - -TEST_F(DBTest, SharedWriteBuffer) { - Options options = CurrentOptions(); - options.db_write_buffer_size = 100000; // this is the real limit - options.write_buffer_size = 500000; // this is never hit - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - - // Trigger a flush on CF "nikitich" - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(3, Key(1), DummyString(90000))); - ASSERT_OK(Put(2, Key(2), DummyString(20000))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, 
"nikitich"), - static_cast(1)); - } - - // "dobrynia": 20KB - // Flush 'dobrynia' - ASSERT_OK(Put(3, Key(2), DummyString(40000))); - ASSERT_OK(Put(2, Key(2), DummyString(70000))); - ASSERT_OK(Put(0, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(1)); - } - - // "nikitich" still has has data of 80KB - // Inserting Data in "dobrynia" triggers "nikitich" flushing. - ASSERT_OK(Put(3, Key(2), DummyString(40000))); - ASSERT_OK(Put(2, Key(2), DummyString(40000))); - ASSERT_OK(Put(0, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(2)); - } - - // "dobrynia" still has 40KB - ASSERT_OK(Put(1, Key(2), DummyString(20000))); - ASSERT_OK(Put(0, Key(1), DummyString(10000))); - ASSERT_OK(Put(0, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // This should triggers no flush - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - 
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(2)); - } - - // "default": 10KB, "pikachu": 20KB, "dobrynia": 40KB - ASSERT_OK(Put(1, Key(2), DummyString(40000))); - ASSERT_OK(Put(0, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // This should triggers flush of "pikachu" - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(2)); - } - - // "default": 10KB, "dobrynia": 40KB - // Some remaining writes so 'default', 'dobrynia' and 'nikitich' flush on - // closure. 
- ASSERT_OK(Put(3, Key(1), DummyString(1))); - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - options); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(3)); - } -} -#endif // ROCKSDB_LITE - -TEST_F(DBTest, PurgeInfoLogs) { - Options options = CurrentOptions(); - options.keep_log_file_num = 5; - options.create_if_missing = true; - for (int mode = 0; mode <= 1; mode++) { - if (mode == 1) { - options.db_log_dir = dbname_ + "_logs"; - env_->CreateDirIfMissing(options.db_log_dir); - } else { - options.db_log_dir = ""; - } - for (int i = 0; i < 8; i++) { - Reopen(options); - } - - std::vector files; - env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir, - &files); - int info_log_count = 0; - for (std::string file : files) { - if (file.find("LOG") != std::string::npos) { - info_log_count++; - } - } - ASSERT_EQ(5, info_log_count); - - Destroy(options); - // For mode (1), test DestroyDB() to delete all the logs under DB dir. - // For mode (2), no info log file should have been put under DB dir. 
- std::vector db_files; - env_->GetChildren(dbname_, &db_files); - for (std::string file : db_files) { - ASSERT_TRUE(file.find("LOG") == std::string::npos); - } - - if (mode == 1) { - // Cleaning up - env_->GetChildren(options.db_log_dir, &files); - for (std::string file : files) { - env_->DeleteFile(options.db_log_dir + "/" + file); - } - env_->DeleteDir(options.db_log_dir); - } - } -} - -TEST_F(DBTest, SyncMultipleLogs) { - const uint64_t kNumBatches = 2; - const int kBatchSize = 1000; - - Options options = CurrentOptions(); - options.create_if_missing = true; - options.write_buffer_size = 4096; - Reopen(options); - - WriteBatch batch; - WriteOptions wo; - wo.sync = true; - - for (uint64_t b = 0; b < kNumBatches; b++) { - batch.Clear(); - for (int i = 0; i < kBatchSize; i++) { - batch.Put(Key(i), DummyString(128)); - } - - dbfull()->Write(wo, &batch); - } - - ASSERT_OK(dbfull()->SyncWAL()); -} - -#ifndef ROCKSDB_LITE -// -// Test WAL recovery for the various modes available -// -class RecoveryTestHelper { - public: - // Number of WAL files to generate - static const int kWALFilesCount = 10; - // Starting number for the WAL file name like 00010.log - static const int kWALFileOffset = 10; - // Keys to be written per WAL file - static const int kKeysPerWALFile = 1024; - // Size of the value - static const int kValueSize = 10; - - // Create WAL files with values filled in - static void FillData(DBTest* test, Options& options, const size_t wal_count, - size_t& count) { - DBOptions& db_options = options; - - count = 0; - - shared_ptr table_cache = NewLRUCache(50000, 16); - EnvOptions env_options; - WriteBuffer write_buffer(db_options.db_write_buffer_size); - - unique_ptr versions; - unique_ptr wal_manager; - WriteController write_controller; - - versions.reset(new VersionSet(test->dbname_, &db_options, env_options, - table_cache.get(), &write_buffer, - &write_controller)); - - wal_manager.reset(new WalManager(db_options, env_options)); - - std::unique_ptr 
current_log_writer; - - for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { - uint64_t current_log_number = j; - std::string fname = LogFileName(test->dbname_, current_log_number); - unique_ptr file; - ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); - current_log_writer.reset( - new log::Writer(std::move(file_writer), current_log_number, - db_options.recycle_log_file_num > 0)); - - for (int i = 0; i < kKeysPerWALFile; i++) { - std::string key = "key" + ToString(count++); - std::string value = test->DummyString(kValueSize); - assert(current_log_writer.get() != nullptr); - uint64_t seq = versions->LastSequence() + 1; - WriteBatch batch; - batch.Put(key, value); - WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); - versions->SetLastSequence(seq); - } - } - } - - // Recreate and fill the store with some data - static size_t FillData(DBTest* test, Options& options) { - options.create_if_missing = true; - test->DestroyAndReopen(options); - test->Close(); - - size_t count = 0; - FillData(test, options, kWALFilesCount, count); - return count; - } - - // Read back all the keys we wrote and return the number of keys found - static size_t GetData(DBTest* test) { - size_t count = 0; - for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) { - if (test->Get("key" + ToString(i)) != "NOT_FOUND") { - ++count; - } - } - return count; - } - - // Manuall corrupt the specified WAL - static void CorruptWAL(DBTest* test, Options& options, const double off, - const double len, const int wal_file_id, - const bool trunc = false) { - Env* env = options.env; - std::string fname = LogFileName(test->dbname_, wal_file_id); - uint64_t size; - ASSERT_OK(env->GetFileSize(fname, &size)); - ASSERT_GT(size, 0); -#ifdef OS_WIN - // Windows disk cache behaves differently. 
When we truncate - // the original content is still in the cache due to the original - // handle is still open. Generally, in Windows, one prohibits - // shared access to files and it is not needed for WAL but we allow - // it to induce corruption at various tests. - test->Close(); -#endif - if (trunc) { - ASSERT_EQ(0, truncate(fname.c_str(), - static_cast(size * off))); - } else { - InduceCorruption(fname, static_cast(size * off), - static_cast(size * len)); - } - } - - // Overwrite data with 'a' from offset for length len - static void InduceCorruption(const std::string& filename, size_t offset, - size_t len) { - ASSERT_GT(len, 0U); - - int fd = open(filename.c_str(), O_RDWR); - - ASSERT_GT(fd, 0); - ASSERT_EQ(offset, lseek(fd, static_cast(offset), SEEK_SET)); - - void* buf = alloca(len); - memset(buf, 'a', len); - ASSERT_EQ(len, write(fd, buf, static_cast(len))); - - close(fd); - } -}; - -// Test scope: -// - We expect to open the data store when there is incomplete trailing writes -// at the end of any of the logs -// - We do not expect to open the data store for corruption -TEST_F(DBTest, kTolerateCorruptedTailRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset position */ - for (int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, options); - // test checksum failure or parsing - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, /*wal=*/j, trunc); - - if (trunc) { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - const size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_TRUE(i == 0 || 
recovered_row_count > 0); - ASSERT_LT(recovered_row_count, row_count); - } else { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - ASSERT_NOK(TryReopen(options)); - } - } - } - } -} - -// Test scope: -// We don't expect the data store to be opened if there is any corruption -// (leading, middle or trailing -- incomplete writes or corruption) -TEST_F(DBTest, kAbsoluteConsistency) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - // Verify clean slate behavior - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, options); - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count); - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset position */ - if (trunc && i == 0) { - continue; - } - - for (int j = jstart; j < jend; j++) { /* wal files */ - // fill with new date - RecoveryTestHelper::FillData(this, options); - // corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - // verify - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; - options.create_if_missing = false; - ASSERT_NOK(TryReopen(options)); - } - } - } -} - -// Test scope: -// - We expect to open data store under all circumstances -// - We expect only data upto the point where the first error was encountered -TEST_F(DBTest, kPointInTimeRecovery) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - const int maxkeys = - RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Offset of corruption */ - for 
(int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, options); - - // Corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify - options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); - - bool expect_data = true; - for (size_t k = 0; k < maxkeys; ++k) { - bool found = Get("key" + ToString(i)) != "NOT_FOUND"; - if (expect_data && !found) { - expect_data = false; - } - ASSERT_EQ(found, expect_data); - } - - const size_t min = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset); - ASSERT_GE(recovered_row_count, min); - if (!trunc && i != 0) { - const size_t max = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset + 1); - ASSERT_LE(recovered_row_count, max); - } - } - } - } -} - -// Test scope: -// - We expect to open the data store under all scenarios -// - We expect to have recovered records past the corruption zone -TEST_F(DBTest, kSkipAnyCorruptedRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset */ - for (int j = jstart; j < jend; j++) { /* wal files */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, options); - - // Corrupt the WAL - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify behavior - options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords; - 
options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); - - if (!trunc) { - ASSERT_TRUE(i != 0 || recovered_row_count > 0); - } - } - } - } -} - -// Multi-threaded test: -namespace { - -static const int kColumnFamilies = 10; -static const int kNumThreads = 10; -static const int kTestSeconds = 10; -static const int kNumKeys = 1000; - -struct MTState { - DBTest* test; - std::atomic stop; - std::atomic counter[kNumThreads]; - std::atomic thread_done[kNumThreads]; -}; - -struct MTThread { - MTState* state; - int id; -}; - -static void MTThreadBody(void* arg) { - MTThread* t = reinterpret_cast(arg); - int id = t->id; - DB* db = t->state->test->db_; - int counter = 0; - fprintf(stderr, "... starting thread %d\n", id); - Random rnd(1000 + id); - char valbuf[1500]; - while (t->state->stop.load(std::memory_order_acquire) == false) { - t->state->counter[id].store(counter, std::memory_order_release); - - int key = rnd.Uniform(kNumKeys); - char keybuf[20]; - snprintf(keybuf, sizeof(keybuf), "%016d", key); - - if (rnd.OneIn(2)) { - // Write values of the form . - // into each of the CFs - // We add some padding for force compactions. - int unique_id = rnd.Uniform(1000000); - - // Half of the time directly use WriteBatch. Half of the time use - // WriteBatchWithIndex. 
- if (rnd.OneIn(2)) { - WriteBatch batch; - for (int cf = 0; cf < kColumnFamilies; ++cf) { - snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, - static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); - } - ASSERT_OK(db->Write(WriteOptions(), &batch)); - } else { - WriteBatchWithIndex batch(db->GetOptions().comparator); - for (int cf = 0; cf < kColumnFamilies; ++cf) { - snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, - static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); - } - ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); - } - } else { - // Read a value and verify that it matches the pattern written above - // and that writes to all column families were atomic (unique_id is the - // same) - std::vector keys(kColumnFamilies, Slice(keybuf)); - std::vector values; - std::vector statuses = - db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values); - Status s = statuses[0]; - // all statuses have to be the same - for (size_t i = 1; i < statuses.size(); ++i) { - // they are either both ok or both not-found - ASSERT_TRUE((s.ok() && statuses[i].ok()) || - (s.IsNotFound() && statuses[i].IsNotFound())); - } - if (s.IsNotFound()) { - // Key has not yet been written - } else { - // Check that the writer thread counter is >= the counter in the value - ASSERT_OK(s); - int unique_id = -1; - for (int i = 0; i < kColumnFamilies; ++i) { - int k, w, c, cf, u; - ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, - &c, &cf, &u)) - << values[i]; - ASSERT_EQ(k, key); - ASSERT_GE(w, 0); - ASSERT_LT(w, kNumThreads); - ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire)); - ASSERT_EQ(cf, i); - if (i == 0) { - unique_id = u; - } else { - // this checks that updates across column families happened - // atomically -- all unique ids are the same - ASSERT_EQ(u, unique_id); - } - } - } - } - 
counter++; - } - t->state->thread_done[id].store(true, std::memory_order_release); - fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter)); -} - -} // namespace - -class MultiThreadedDBTest : public DBTest, - public ::testing::WithParamInterface { - public: - virtual void SetUp() override { option_config_ = GetParam(); } - - static std::vector GenerateOptionConfigs() { - std::vector optionConfigs; - for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) { - // skip as HashCuckooRep does not support snapshot - if (optionConfig != kHashCuckoo) { - optionConfigs.push_back(optionConfig); - } - } - return optionConfigs; - } -}; - -TEST_P(MultiThreadedDBTest, MultiThreaded) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - std::vector cfs; - for (int i = 1; i < kColumnFamilies; ++i) { - cfs.push_back(ToString(i)); - } - CreateAndReopenWithCF(cfs, CurrentOptions(options_override)); - // Initialize state - MTState mt; - mt.test = this; - mt.stop.store(false, std::memory_order_release); - for (int id = 0; id < kNumThreads; id++) { - mt.counter[id].store(0, std::memory_order_release); - mt.thread_done[id].store(false, std::memory_order_release); - } - - // Start threads - MTThread thread[kNumThreads]; - for (int id = 0; id < kNumThreads; id++) { - thread[id].state = &mt; - thread[id].id = id; - env_->StartThread(MTThreadBody, &thread[id]); - } - - // Let them run for a while - env_->SleepForMicroseconds(kTestSeconds * 1000000); - - // Stop the threads and wait for them to finish - mt.stop.store(true, std::memory_order_release); - for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].load(std::memory_order_acquire) == false) { - env_->SleepForMicroseconds(100000); - } - } -} - -INSTANTIATE_TEST_CASE_P( - MultiThreaded, MultiThreadedDBTest, - ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs())); -#endif // ROCKSDB_LITE - -// Group commit test: -namespace { - 
-static const int kGCNumThreads = 4; -static const int kGCNumKeys = 1000; - -struct GCThread { - DB* db; - int id; - std::atomic done; -}; - -static void GCThreadBody(void* arg) { - GCThread* t = reinterpret_cast(arg); - int id = t->id; - DB* db = t->db; - WriteOptions wo; - - for (int i = 0; i < kGCNumKeys; ++i) { - std::string kv(ToString(i + id * kGCNumKeys)); - ASSERT_OK(db->Put(wo, kv, kv)); - } - t->done = true; -} - -} // namespace - -TEST_F(DBTest, GroupCommitTest) { - do { - Options options = CurrentOptions(); - options.env = env_; - env_->log_write_slowdown_.store(100); - options.statistics = rocksdb::CreateDBStatistics(); - Reopen(options); - - // Start threads - GCThread thread[kGCNumThreads]; - for (int id = 0; id < kGCNumThreads; id++) { - thread[id].id = id; - thread[id].db = db_; - thread[id].done = false; - env_->StartThread(GCThreadBody, &thread[id]); - } - - for (int id = 0; id < kGCNumThreads; id++) { - while (thread[id].done == false) { - env_->SleepForMicroseconds(100000); - } - } - env_->log_write_slowdown_.store(0); - - ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); - - std::vector expected_db; - for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { - expected_db.push_back(ToString(i)); - } - sort(expected_db.begin(), expected_db.end()); - - Iterator* itr = db_->NewIterator(ReadOptions()); - itr->SeekToFirst(); - for (auto x : expected_db) { - ASSERT_TRUE(itr->Valid()); - ASSERT_EQ(itr->key().ToString(), x); - ASSERT_EQ(itr->value().ToString(), x); - itr->Next(); - } - ASSERT_TRUE(!itr->Valid()); - delete itr; - - HistogramData hist_data = {0, 0, 0, 0, 0}; - options.statistics->histogramData(DB_WRITE, &hist_data); - ASSERT_GT(hist_data.average, 0.0); - } while (ChangeOptions(kSkipNoSeekToLast)); -} - -namespace { -typedef std::map KVMap; -} - -class ModelDB: public DB { - public: - class ModelSnapshot : public Snapshot { - public: - KVMap map_; - - virtual SequenceNumber GetSequenceNumber() const override { - // no need to 
call this - assert(false); - return 0; - } - }; - - explicit ModelDB(const Options& options) : options_(options) {} - using DB::Put; - virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& k, const Slice& v) override { - WriteBatch batch; - batch.Put(cf, k, v); - return Write(o, &batch); - } - using DB::Delete; - virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& key) override { - WriteBatch batch; - batch.Delete(cf, key); - return Write(o, &batch); - } - using DB::SingleDelete; - virtual Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& key) override { - WriteBatch batch; - batch.SingleDelete(cf, key); - return Write(o, &batch); - } - using DB::Merge; - virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& k, const Slice& v) override { - WriteBatch batch; - batch.Merge(cf, k, v); - return Write(o, &batch); - } - using DB::Get; - virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf, - const Slice& key, std::string* value) override { - return Status::NotSupported(key); - } - - using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, - std::vector* values) override { - std::vector s(keys.size(), - Status::NotSupported("Not implemented.")); - return s; - } - -#ifndef ROCKSDB_LITE - using DB::AddFile; - virtual Status AddFile(ColumnFamilyHandle* column_family, - const ExternalSstFileInfo* file_path, - bool move_file) override { - return Status::NotSupported("Not implemented."); - } - virtual Status AddFile(ColumnFamilyHandle* column_family, - const std::string& file_path, - bool move_file) override { - return Status::NotSupported("Not implemented."); - } - - using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override { - return Status(); - } - - 
virtual Status GetPropertiesOfTablesInRange( - ColumnFamilyHandle* column_family, const Range* range, std::size_t n, - TablePropertiesCollection* props) override { - return Status(); - } -#endif // ROCKSDB_LITE - - using DB::KeyMayExist; - virtual bool KeyMayExist(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, - bool* value_found = nullptr) override { - if (value_found != nullptr) { - *value_found = false; - } - return true; // Not Supported directly - } - using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& options, - ColumnFamilyHandle* column_family) override { - if (options.snapshot == nullptr) { - KVMap* saved = new KVMap; - *saved = map_; - return new ModelIter(saved, true); - } else { - const KVMap* snapshot_state = - &(reinterpret_cast(options.snapshot)->map_); - return new ModelIter(snapshot_state, false); - } - } - virtual Status NewIterators( - const ReadOptions& options, - const std::vector& column_family, - std::vector* iterators) override { - return Status::NotSupported("Not supported yet"); - } - virtual const Snapshot* GetSnapshot() override { - ModelSnapshot* snapshot = new ModelSnapshot; - snapshot->map_ = map_; - return snapshot; - } - - virtual void ReleaseSnapshot(const Snapshot* snapshot) override { - delete reinterpret_cast(snapshot); - } - - virtual Status Write(const WriteOptions& options, - WriteBatch* batch) override { - class Handler : public WriteBatch::Handler { - public: - KVMap* map_; - virtual void Put(const Slice& key, const Slice& value) override { - (*map_)[key.ToString()] = value.ToString(); - } - virtual void Merge(const Slice& key, const Slice& value) override { - // ignore merge for now - //(*map_)[key.ToString()] = value.ToString(); - } - virtual void Delete(const Slice& key) override { - map_->erase(key.ToString()); - } - }; - Handler handler; - handler.map_ = &map_; - return batch->Iterate(&handler); - } - - using DB::GetProperty; - virtual bool 
GetProperty(ColumnFamilyHandle* column_family, - const Slice& property, std::string* value) override { - return false; - } - using DB::GetIntProperty; - virtual bool GetIntProperty(ColumnFamilyHandle* column_family, - const Slice& property, uint64_t* value) override { - return false; - } - using DB::GetAggregatedIntProperty; - virtual bool GetAggregatedIntProperty(const Slice& property, - uint64_t* value) override { - return false; - } - using DB::GetApproximateSizes; - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - bool include_memtable) override { - for (int i = 0; i < n; i++) { - sizes[i] = 0; - } - } - using DB::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* start, const Slice* end) override { - return Status::NotSupported("Not supported operation."); - } - - using DB::CompactFiles; - virtual Status CompactFiles( - const CompactionOptions& compact_options, - ColumnFamilyHandle* column_family, - const std::vector& input_file_names, - const int output_level, const int output_path_id = -1) override { - return Status::NotSupported("Not supported operation."); - } - - Status PauseBackgroundWork() override { - return Status::NotSupported("Not supported operation."); - } - - Status ContinueBackgroundWork() override { - return Status::NotSupported("Not supported operation."); - } - - Status EnableAutoCompaction( - const std::vector& column_family_handles) override { - return Status::NotSupported("Not supported operation."); - } - - using DB::NumberLevels; - virtual int NumberLevels(ColumnFamilyHandle* column_family) override { - return 1; - } - - using DB::MaxMemCompactionLevel; - virtual int MaxMemCompactionLevel( - ColumnFamilyHandle* column_family) override { - return 1; - } - - using DB::Level0StopWriteTrigger; - virtual int Level0StopWriteTrigger( - ColumnFamilyHandle* column_family) override { - return -1; - } - 
- virtual const std::string& GetName() const override { return name_; } - - virtual Env* GetEnv() const override { return nullptr; } - - using DB::GetOptions; - virtual const Options& GetOptions( - ColumnFamilyHandle* column_family) const override { - return options_; - } - - using DB::GetDBOptions; - virtual const DBOptions& GetDBOptions() const override { return options_; } - - using DB::Flush; - virtual Status Flush(const rocksdb::FlushOptions& options, - ColumnFamilyHandle* column_family) override { - Status ret; - return ret; - } - - virtual Status SyncWAL() override { - return Status::OK(); - } - -#ifndef ROCKSDB_LITE - virtual Status DisableFileDeletions() override { return Status::OK(); } - - virtual Status EnableFileDeletions(bool force) override { - return Status::OK(); - } - virtual Status GetLiveFiles(std::vector&, uint64_t* size, - bool flush_memtable = true) override { - return Status::OK(); - } - - virtual Status GetSortedWalFiles(VectorLogPtr& files) override { - return Status::OK(); - } - - virtual Status DeleteFile(std::string name) override { return Status::OK(); } - - virtual Status GetUpdatesSince( - rocksdb::SequenceNumber, unique_ptr*, - const TransactionLogIterator::ReadOptions& - read_options = TransactionLogIterator::ReadOptions()) override { - return Status::NotSupported("Not supported in Model DB"); - } - - virtual void GetColumnFamilyMetaData( - ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* metadata) override {} -#endif // ROCKSDB_LITE - - virtual Status GetDbIdentity(std::string& identity) const override { - return Status::OK(); - } - - virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; } - - virtual ColumnFamilyHandle* DefaultColumnFamily() const override { - return nullptr; - } - - private: - class ModelIter: public Iterator { - public: - ModelIter(const KVMap* map, bool owned) - : map_(map), owned_(owned), iter_(map_->end()) { - } - ~ModelIter() { - if (owned_) delete map_; - } - virtual bool 
Valid() const override { return iter_ != map_->end(); } - virtual void SeekToFirst() override { iter_ = map_->begin(); } - virtual void SeekToLast() override { - if (map_->empty()) { - iter_ = map_->end(); - } else { - iter_ = map_->find(map_->rbegin()->first); - } - } - virtual void Seek(const Slice& k) override { - iter_ = map_->lower_bound(k.ToString()); - } - virtual void Next() override { ++iter_; } - virtual void Prev() override { - if (iter_ == map_->begin()) { - iter_ = map_->end(); - return; - } - --iter_; - } - - virtual Slice key() const override { return iter_->first; } - virtual Slice value() const override { return iter_->second; } - virtual Status status() const override { return Status::OK(); } - - private: - const KVMap* const map_; - const bool owned_; // Do we own map_ - KVMap::const_iterator iter_; - }; - const Options options_; - KVMap map_; - std::string name_ = ""; -}; - -static std::string RandomKey(Random* rnd, int minimum = 0) { - int len; - do { - len = (rnd->OneIn(3) - ? 1 // Short sometimes to encourage collisions - : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); - } while (len < minimum); - return test::RandomKey(rnd, len); -} - -static bool CompareIterators(int step, - DB* model, - DB* db, - const Snapshot* model_snap, - const Snapshot* db_snap) { - ReadOptions options; - options.snapshot = model_snap; - Iterator* miter = model->NewIterator(options); - options.snapshot = db_snap; - Iterator* dbiter = db->NewIterator(options); - bool ok = true; - int count = 0; - for (miter->SeekToFirst(), dbiter->SeekToFirst(); - ok && miter->Valid() && dbiter->Valid(); - miter->Next(), dbiter->Next()) { - count++; - if (miter->key().compare(dbiter->key()) != 0) { - fprintf(stderr, "step %d: Key mismatch: '%s' vs. 
'%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(dbiter->key()).c_str()); - ok = false; - break; - } - - if (miter->value().compare(dbiter->value()) != 0) { - fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(miter->value()).c_str(), - EscapeString(miter->value()).c_str()); - ok = false; - } - } - - if (ok) { - if (miter->Valid() != dbiter->Valid()) { - fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", - step, miter->Valid(), dbiter->Valid()); - ok = false; - } - } - delete miter; - delete dbiter; - return ok; -} - -class DBTestRandomized : public DBTest, - public ::testing::WithParamInterface { - public: - virtual void SetUp() override { option_config_ = GetParam(); } - - static std::vector GenerateOptionConfigs() { - std::vector option_configs; - // skip cuckoo hash as it does not support snapshot. - for (int option_config = kDefault; option_config < kEnd; ++option_config) { - if (!ShouldSkipOptions(option_config, kSkipDeletesFilterFirst | - kSkipNoSeekToLast | - kSkipHashCuckoo)) { - option_configs.push_back(option_config); - } - } - option_configs.push_back(kBlockBasedTableWithIndexRestartInterval); - return option_configs; - } -}; - -INSTANTIATE_TEST_CASE_P( - DBTestRandomized, DBTestRandomized, - ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs())); - -TEST_P(DBTestRandomized, Randomized) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - Options options = CurrentOptions(options_override); - DestroyAndReopen(options); - - Random rnd(test::RandomSeed() + GetParam()); - ModelDB model(options); - const int N = 10000; - const Snapshot* model_snap = nullptr; - const Snapshot* db_snap = nullptr; - std::string k, v; - for (int step = 0; step < N; step++) { - // TODO(sanjay): Test Get() works - int p = rnd.Uniform(100); - int minimum = 0; - if (option_config_ == kHashSkipList || - 
option_config_ == kHashLinkList || - option_config_ == kHashCuckoo || - option_config_ == kPlainTableFirstBytePrefix || - option_config_ == kBlockBasedTableWithWholeKeyHashIndex || - option_config_ == kBlockBasedTableWithPrefixHashIndex) { - minimum = 1; - } - if (p < 45) { // Put - k = RandomKey(&rnd, minimum); - v = RandomString(&rnd, - rnd.OneIn(20) - ? 100 + rnd.Uniform(100) - : rnd.Uniform(8)); - ASSERT_OK(model.Put(WriteOptions(), k, v)); - ASSERT_OK(db_->Put(WriteOptions(), k, v)); - } else if (p < 90) { // Delete - k = RandomKey(&rnd, minimum); - ASSERT_OK(model.Delete(WriteOptions(), k)); - ASSERT_OK(db_->Delete(WriteOptions(), k)); - } else { // Multi-element batch - WriteBatch b; - const int num = rnd.Uniform(8); - for (int i = 0; i < num; i++) { - if (i == 0 || !rnd.OneIn(10)) { - k = RandomKey(&rnd, minimum); - } else { - // Periodically re-use the same key from the previous iter, so - // we have multiple entries in the write batch for the same key - } - if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); - } else { - b.Delete(k); - } - } - ASSERT_OK(model.Write(WriteOptions(), &b)); - ASSERT_OK(db_->Write(WriteOptions(), &b)); - } - - if ((step % 100) == 0) { - // For DB instances that use the hash index + block-based table, the - // iterator will be invalid right when seeking a non-existent key, right - // than return a key that is close to it. 
- if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && - option_config_ != kBlockBasedTableWithPrefixHashIndex) { - ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); - } - - // Save a snapshot from each DB this time that we'll use next - // time we compare things, to make sure the current state is - // preserved with the snapshot - if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); - if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - - Reopen(options); - ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); - - model_snap = model.GetSnapshot(); - db_snap = db_->GetSnapshot(); - } - } - if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); - if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); -} - -TEST_F(DBTest, MultiGetSimple) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "k1", "v1")); - ASSERT_OK(Put(1, "k2", "v2")); - ASSERT_OK(Put(1, "k3", "v3")); - ASSERT_OK(Put(1, "k4", "v4")); - ASSERT_OK(Delete(1, "k4")); - ASSERT_OK(Put(1, "k5", "v5")); - ASSERT_OK(Delete(1, "no_key")); - - std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); - - std::vector values(20, "Temporary data to be overwritten"); - std::vector cfs(keys.size(), handles_[1]); - - std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); - ASSERT_EQ(values.size(), keys.size()); - ASSERT_EQ(values[0], "v1"); - ASSERT_EQ(values[1], "v2"); - ASSERT_EQ(values[2], "v3"); - ASSERT_EQ(values[4], "v5"); - - ASSERT_OK(s[0]); - ASSERT_OK(s[1]); - ASSERT_OK(s[2]); - ASSERT_TRUE(s[3].IsNotFound()); - ASSERT_OK(s[4]); - ASSERT_TRUE(s[5].IsNotFound()); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, MultiGetEmpty) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - // Empty Key Set - std::vector keys; - std::vector values; - std::vector cfs; - std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, 
&values); - ASSERT_EQ(s.size(), 0U); - - // Empty Database, Empty Key Set - Options options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - s = db_->MultiGet(ReadOptions(), cfs, keys, &values); - ASSERT_EQ(s.size(), 0U); - - // Empty Database, Search for Keys - keys.resize(2); - keys[0] = "a"; - keys[1] = "b"; - cfs.push_back(handles_[0]); - cfs.push_back(handles_[1]); - s = db_->MultiGet(ReadOptions(), cfs, keys, &values); - ASSERT_EQ((int)s.size(), 2); - ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); - } while (ChangeCompactOptions()); -} - -#ifndef ROCKSDB_LITE -namespace { -void PrefixScanInit(DBTest *dbtest) { - char buf[100]; - std::string keystr; - const int small_range_sstfiles = 5; - const int big_range_sstfiles = 5; - - // Generate 11 sst files with the following prefix ranges. - // GROUP 0: [0,10] (level 1) - // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) - // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) - // - // A seek with the previous API would do 11 random I/Os (to all the - // files). With the new API and a prefix filter enabled, we should - // only do 2 random I/O, to the 2 files containing the key. 
- - // GROUP 0 - snprintf(buf, sizeof(buf), "%02d______:start", 0); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - snprintf(buf, sizeof(buf), "%02d______:end", 10); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, - nullptr); // move to level 1 - - // GROUP 1 - for (int i = 1; i <= small_range_sstfiles; i++) { - snprintf(buf, sizeof(buf), "%02d______:start", i); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - snprintf(buf, sizeof(buf), "%02d______:end", i+1); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - } - - // GROUP 2 - for (int i = 1; i <= big_range_sstfiles; i++) { - snprintf(buf, sizeof(buf), "%02d______:start", 0); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - snprintf(buf, sizeof(buf), "%02d______:end", - small_range_sstfiles+i+1); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - } -} -} // namespace - -TEST_F(DBTest, PrefixScan) { - XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, - kSkipNoPrefix); - while (ChangeFilterOptions()) { - int count; - Slice prefix; - Slice key; - char buf[100]; - Iterator* iter; - snprintf(buf, sizeof(buf), "03______:"); - prefix = Slice(buf, 8); - key = Slice(buf, 9); - ASSERT_EQ(key.difference_offset(prefix), 8); - ASSERT_EQ(prefix.difference_offset(key), 8); - // db configs - env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.disable_auto_compactions = true; - options.max_background_compactions = 2; - options.create_if_missing = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - 
table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - table_options.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // 11 RAND I/Os - DestroyAndReopen(options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); - for (iter->Seek(prefix); iter->Valid(); iter->Next()) { - if (! iter->key().starts_with(prefix)) { - break; - } - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - Close(); - } // end of while - XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0); -} -#endif // ROCKSDB_LITE - -TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { - // create a DB with block prefix index - BlockBasedTableOptions table_options; - Options options = CurrentOptions(); - table_options.index_type = BlockBasedTableOptions::kHashSearch; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - - - Reopen(options); - ASSERT_OK(Put("k1", "v1")); - Flush(); - ASSERT_OK(Put("k2", "v2")); - - // Reopen it without prefix extractor, make sure everything still works. - // RocksDB should just fall back to the binary index. 
- table_options.index_type = BlockBasedTableOptions::kBinarySearch; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.prefix_extractor.reset(); - - Reopen(options); - ASSERT_EQ("v1", Get("k1")); - ASSERT_EQ("v2", Get("k2")); -} - -TEST_F(DBTest, ChecksumTest) { - BlockBasedTableOptions table_options; - Options options = CurrentOptions(); - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - ASSERT_OK(Flush()); // table with crc checksum - - table_options.checksum = kxxHash; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_OK(Put("e", "f")); - ASSERT_OK(Put("g", "h")); - ASSERT_OK(Flush()); // table with xxhash checksum - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_EQ("b", Get("a")); - ASSERT_EQ("d", Get("c")); - ASSERT_EQ("f", Get("e")); - ASSERT_EQ("h", Get("g")); - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_EQ("b", Get("a")); - ASSERT_EQ("d", Get("c")); - ASSERT_EQ("f", Get("e")); - ASSERT_EQ("h", Get("g")); -} - -#ifndef ROCKSDB_LITE -TEST_P(DBTestWithParam, FIFOCompactionTest) { - for (int iter = 0; iter < 2; ++iter) { - // first iteration -- auto compaction - // second iteration -- manual compaction - Options options; - options.compaction_style = kCompactionStyleFIFO; - options.write_buffer_size = 100 << 10; // 100KB - options.arena_block_size = 4096; - options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB - options.compression = kNoCompression; - options.create_if_missing = true; - options.max_subcompactions = max_subcompactions_; - if (iter == 1) { - options.disable_auto_compactions = true; - } - options = CurrentOptions(options); - 
DestroyAndReopen(options); - - Random rnd(301); - for (int i = 0; i < 6; ++i) { - for (int j = 0; j < 110; ++j) { - ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980))); - } - // flush should happen here - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - } - if (iter == 0) { - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - } else { - CompactRangeOptions cro; - cro.exclusive_manual_compaction = exclusive_manual_compaction_; - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - } - // only 5 files should survive - ASSERT_EQ(NumTableFilesAtLevel(0), 5); - for (int i = 0; i < 50; ++i) { - // these keys should be deleted in previous compaction - ASSERT_EQ("NOT_FOUND", Get(ToString(i))); - } - } -} -#endif // ROCKSDB_LITE - -// verify that we correctly deprecated timeout_hint_us -TEST_F(DBTest, SimpleWriteTimeoutTest) { - WriteOptions write_opt; - write_opt.timeout_hint_us = 0; - ASSERT_OK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); - write_opt.timeout_hint_us = 10; - ASSERT_NOK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); -} - -#ifndef ROCKSDB_LITE -/* - * This test is not reliable enough as it heavily depends on disk behavior. 
- */ -TEST_F(DBTest, RateLimitingTest) { - Options options = CurrentOptions(); - options.write_buffer_size = 1 << 20; // 1MB - options.level0_file_num_compaction_trigger = 2; - options.target_file_size_base = 1 << 20; // 1MB - options.max_bytes_for_level_base = 4 << 20; // 4MB - options.max_bytes_for_level_multiplier = 4; - options.compression = kNoCompression; - options.create_if_missing = true; - options.env = env_; - options.IncreaseParallelism(4); - DestroyAndReopen(options); - - WriteOptions wo; - wo.disableWAL = true; - - // # no rate limiting - Random rnd(301); - uint64_t start = env_->NowMicros(); - // Write ~96M data - for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK(Put(RandomString(&rnd, 32), - RandomString(&rnd, (1 << 10) + 1), wo)); - } - uint64_t elapsed = env_->NowMicros() - start; - double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed; - Close(); - - // # rate limiting with 0.7 x threshold - options.rate_limiter.reset( - NewGenericRateLimiter(static_cast(0.7 * raw_rate))); - env_->bytes_written_ = 0; - DestroyAndReopen(options); - - start = env_->NowMicros(); - // Write ~96M data - for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK(Put(RandomString(&rnd, 32), - RandomString(&rnd, (1 << 10) + 1), wo)); - } - elapsed = env_->NowMicros() - start; - Close(); - ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); - double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; - fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio); - ASSERT_TRUE(ratio < 0.8); - - // # rate limiting with half of the raw_rate - options.rate_limiter.reset( - NewGenericRateLimiter(static_cast(raw_rate / 2))); - env_->bytes_written_ = 0; - DestroyAndReopen(options); - - start = env_->NowMicros(); - // Write ~96M data - for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK(Put(RandomString(&rnd, 32), - RandomString(&rnd, (1 << 10) + 1), wo)); - } - elapsed = env_->NowMicros() - start; - Close(); - 
ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); - ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; - fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio); - ASSERT_LT(ratio, 0.6); -} - -TEST_F(DBTest, TableOptionsSanitizeTest) { - Options options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); - ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); - - options.table_factory.reset(new PlainTableFactory()); - options.prefix_extractor.reset(NewNoopTransform()); - Destroy(options); - ASSERT_TRUE(!TryReopen(options).IsNotSupported()); - - // Test for check of prefix_extractor when hash index is used for - // block-based table - BlockBasedTableOptions to; - to.index_type = BlockBasedTableOptions::kHashSearch; - options = CurrentOptions(); - options.create_if_missing = true; - options.table_factory.reset(NewBlockBasedTableFactory(to)); - ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - ASSERT_OK(TryReopen(options)); -} - -TEST_F(DBTest, MmapAndBufferOptions) { - Options options = CurrentOptions(); - - // If allow_mmap_reads is on allow_os_buffer must also be on - // On Windows you can have either memory mapped file or a file - // with unbuffered access. 
-#ifndef OS_WIN - options.allow_os_buffer = false; -#endif - options.allow_mmap_reads = true; - ASSERT_NOK(TryReopen(options)); - - // All other combinations are acceptable - options.allow_os_buffer = true; - ASSERT_OK(TryReopen(options)); - - options.allow_os_buffer = false; - options.allow_mmap_reads = false; - ASSERT_OK(TryReopen(options)); - - options.allow_os_buffer = true; - ASSERT_OK(TryReopen(options)); -} - -TEST_F(DBTest, ConcurrentMemtableNotSupported) { - Options options = CurrentOptions(); - options.allow_concurrent_memtable_write = true; - options.soft_pending_compaction_bytes_limit = 0; - options.hard_pending_compaction_bytes_limit = 100; - options.create_if_missing = true; - - DestroyDB(dbname_, options); - options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4)); - ASSERT_NOK(TryReopen(options)); - - options.memtable_factory.reset(new SkipListFactory); - ASSERT_OK(TryReopen(options)); - - ColumnFamilyOptions cf_options(options); - cf_options.memtable_factory.reset( - NewHashLinkListRepFactory(4, 0, 3, true, 4)); - ColumnFamilyHandle* handle; - ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle)); -} - -#endif // ROCKSDB_LITE - -TEST_F(DBTest, SanitizeNumThreads) { - for (int attempt = 0; attempt < 2; attempt++) { - const size_t kTotalTasks = 8; - test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; - - Options options = CurrentOptions(); - if (attempt == 0) { - options.max_background_compactions = 3; - options.max_background_flushes = 2; - } - options.create_if_missing = true; - DestroyAndReopen(options); - - for (size_t i = 0; i < kTotalTasks; i++) { - // Insert 5 tasks to low priority queue and 5 tasks to high priority queue - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_tasks[i], - (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); - } - - // Wait 100 milliseconds for they are scheduled. - env_->SleepForMicroseconds(100000); - - // pool size 3, total task 4. Queue size should be 1. 
- ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW)); - // pool size 2, total task 4. Queue size should be 2. - ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); - - for (size_t i = 0; i < kTotalTasks; i++) { - sleeping_tasks[i].WakeUp(); - sleeping_tasks[i].WaitUntilDone(); - } - - ASSERT_OK(Put("abc", "def")); - ASSERT_EQ("def", Get("abc")); - Flush(); - ASSERT_EQ("def", Get("abc")); - } -} - -TEST_F(DBTest, DBIteratorBoundTest) { - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - - options.prefix_extractor = nullptr; - DestroyAndReopen(options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("g1", "0")); - - // testing basic case with no iterate_upper_bound and no prefix_extractor - { - ReadOptions ro; - ro.iterate_upper_bound = nullptr; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("g1")), 0); - } - - // testing iterate_upper_bound and forward iterator - // to make sure it stops at bound - { - ReadOptions ro; - // iterate_upper_bound points beyond the last expected entry - Slice prefix("foo2"); - ro.iterate_upper_bound = &prefix; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("foo1")), 0); - - iter->Next(); - // should stop here... 
- ASSERT_TRUE(!iter->Valid()); - } - // Testing SeekToLast with iterate_upper_bound set - { - ReadOptions ro; - - Slice prefix("foo"); - ro.iterate_upper_bound = &prefix; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->SeekToLast(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("a")), 0); - } - - // prefix is the first letter of the key - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - - DestroyAndReopen(options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("g1", "0")); - - // testing with iterate_upper_bound and prefix_extractor - // Seek target and iterate_upper_bound are not is same prefix - // This should be an error - { - ReadOptions ro; - Slice upper_bound("g"); - ro.iterate_upper_bound = &upper_bound; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo1", iter->key().ToString()); - - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - } - - // testing that iterate_upper_bound prevents iterating over deleted items - // if the bound has already reached - { - options.prefix_extractor = nullptr; - DestroyAndReopen(options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("b", "0")); - ASSERT_OK(Put("b1", "0")); - ASSERT_OK(Put("c", "0")); - ASSERT_OK(Put("d", "0")); - ASSERT_OK(Put("e", "0")); - ASSERT_OK(Delete("c")); - ASSERT_OK(Delete("d")); - - // base case with no bound - ReadOptions ro; - ro.iterate_upper_bound = nullptr; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("b"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("b")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("b1")), 0); - - perf_context.Reset(); - iter->Next(); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 
2); - - // now testing with iterate_bound - Slice prefix("c"); - ro.iterate_upper_bound = &prefix; - - iter.reset(db_->NewIterator(ro)); - - perf_context.Reset(); - - iter->Seek("b"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("b")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("b1")), 0); - - iter->Next(); - // the iteration should stop as soon as the the bound key is reached - // even though the key is deleted - // hence internal_delete_skipped_count should be 0 - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); - } -} - -TEST_F(DBTest, WriteSingleThreadEntry) { - std::vector threads; - dbfull()->TEST_LockMutex(); - auto w = dbfull()->TEST_BeginWrite(); - threads.emplace_back([&] { Put("a", "b"); }); - env_->SleepForMicroseconds(10000); - threads.emplace_back([&] { Flush(); }); - env_->SleepForMicroseconds(10000); - dbfull()->TEST_UnlockMutex(); - dbfull()->TEST_LockMutex(); - dbfull()->TEST_EndWrite(w); - dbfull()->TEST_UnlockMutex(); - - for (auto& t : threads) { - t.join(); - } -} - -TEST_F(DBTest, DisableDataSyncTest) { - env_->sync_counter_.store(0); - // iter 0 -- no sync - // iter 1 -- sync - for (int iter = 0; iter < 2; ++iter) { - Options options = CurrentOptions(); - options.disableDataSync = iter == 0; - options.create_if_missing = true; - options.num_levels = 10; - options.env = env_; - Reopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - - MakeTables(10, "a", "z"); - Compact("a", "z"); + uint64_t size; + ASSERT_OK(env_->GetFileSize(src, &size)); - if (iter == 0) { - ASSERT_EQ(env_->sync_counter_.load(), 0); - } else { - ASSERT_GT(env_->sync_counter_.load(), 0); + // record the number and the size of the + // latest manifest file + if (ParseFileName(files[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > manifest_number) { + manifest_number = number; + ASSERT_GE(size, manifest_size); + size = 
manifest_size; // copy only valid MANIFEST data + } + } + } + CopyFile(src, dest, size); } - Destroy(options); - } -} -#ifndef ROCKSDB_LITE -TEST_F(DBTest, DynamicMemtableOptions) { - const uint64_t k64KB = 1 << 16; - const uint64_t k128KB = 1 << 17; - const uint64_t k5KB = 5 * 1024; - const int kNumPutsBeforeWaitForFlush = 64; - Options options; - options.env = env_; - options.create_if_missing = true; - options.compression = kNoCompression; - options.max_background_compactions = 1; - options.write_buffer_size = k64KB; - options.arena_block_size = 16 * 1024; - options.max_write_buffer_number = 2; - // Don't trigger compact/slowdown/stop - options.level0_file_num_compaction_trigger = 1024; - options.level0_slowdown_writes_trigger = 1024; - options.level0_stop_writes_trigger = 1024; - DestroyAndReopen(options); + // release file snapshot + dbfull()->DisableFileDeletions(); + // overwrite one key, this key should not appear in the snapshot + std::vector extras; + for (unsigned int i = 0; i < 1; i++) { + extras.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(0, Key(i), extras[i])); + } - auto gen_l0_kb = [this, kNumPutsBeforeWaitForFlush](int size) { - Random rnd(301); - for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // verify that data in the snapshot are correct + std::vector column_families; + column_families.emplace_back("default", ColumnFamilyOptions()); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); + std::vector cf_handles; + DB* snapdb; + DBOptions opts; + opts.env = env_; + opts.create_if_missing = false; + Status stat = + DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); + ASSERT_OK(stat); - // The following condition prevents a race condition between flush jobs - // acquiring work and this thread filling up multiple memtables. Without - // this, the flush might produce less files than expected because - // multiple memtables are flushed into a single L0 file. 
This race - // condition affects assertion (A). - if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) { - dbfull()->TEST_WaitForFlushMemTable(); - } + ReadOptions roptions; + std::string val; + for (unsigned int i = 0; i < 80; i++) { + stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); + ASSERT_EQ(values[i].compare(val), 0); } - dbfull()->TEST_WaitForFlushMemTable(); - }; + for (auto cfh : cf_handles) { + delete cfh; + } + delete snapdb; - // Test write_buffer_size - gen_l0_kb(64); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - ASSERT_LT(SizeAtLevel(0), k64KB + k5KB); - ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2); + // look at the new live files after we added an 'extra' key + // and after we took the first snapshot. + uint64_t new_manifest_number = 0; + uint64_t new_manifest_size = 0; + std::vector newfiles; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(newfiles, &new_manifest_size); - // Clean up L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + // find the new manifest file. assert that this manifest file is + // the same one as in the previous snapshot. But its size should be + // larger because we added an extra key after taking the + // previous shapshot. 
+ for (size_t i = 0; i < newfiles.size(); i++) { + std::string src = dbname_ + "/" + newfiles[i]; + // record the lognumber and the size of the + // latest manifest file + if (ParseFileName(newfiles[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > new_manifest_number) { + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); + } + } + } + } + ASSERT_EQ(manifest_number, new_manifest_number); + ASSERT_GT(new_manifest_size, manifest_size); - // Increase buffer size - ASSERT_OK(dbfull()->SetOptions({ - {"write_buffer_size", "131072"}, - })); + // release file snapshot + dbfull()->DisableFileDeletions(); + } while (ChangeCompactOptions()); +} +#endif - // The existing memtable is still 64KB in size, after it becomes immutable, - // the next memtable will be 128KB in size. Write 256KB total, we should - // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data - gen_l0_kb(256); - ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A) - ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); - ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB); +TEST_F(DBTest, CompactOnFlush) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); - // Test max_write_buffer_number - // Block compaction thread, which will also block the flushes because - // max_background_flushes == 0, so flushes are getting executed by the - // compaction thread - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - // Start from scratch and disable compaction/flush. 
Flush can only happen - // during compaction but trigger is pretty high - options.max_background_flushes = 0; - options.disable_auto_compactions = true; - DestroyAndReopen(options); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); - // Put until writes are stopped, bounded by 256 puts. We should see stop at - // ~128KB - int count = 0; - Random rnd(301); + // Write two new keys + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DelayWrite:Wait", - [&](void* arg) { sleeping_task_low.WakeUp(); }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Case1: Delete followed by a put + Delete(1, "foo"); + Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); - while (!sleeping_task_low.WokenUp() && count < 256) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); - count++; - } - ASSERT_GT(static_cast(count), 128 * 0.8); - ASSERT_LT(static_cast(count), 128 * 1.2); + // After the current memtable is flushed, the DEL should + // have been removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - sleeping_task_low.WaitUntilDone(); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); - // Increase - ASSERT_OK(dbfull()->SetOptions({ - {"max_write_buffer_number", "8"}, - })); - // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // Case 2: Delete followed by another delete + Delete(1, "foo"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, 
&sleeping_task_low, - Env::Priority::LOW); - count = 0; - while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); - count++; - } - // Windows fails this test. Will tune in the future and figure out - // approp number -#ifndef OS_WIN - ASSERT_GT(static_cast(count), 512 * 0.8); - ASSERT_LT(static_cast(count), 512 * 1.2); -#endif - sleeping_task_low.WaitUntilDone(); + // Case 3: Put followed by a delete + Put(1, "foo", "v3"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - // Decrease - ASSERT_OK(dbfull()->SetOptions({ - {"max_write_buffer_number", "4"}, - })); - // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // Case 4: Put followed by another Put + Put(1, "foo", "v4"); + Put(1, "foo", "v5"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + // clear database + Delete(1, "foo"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - count = 0; - while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); - count++; - } - // Windows fails this test. 
Will tune in the future and figure out - // approp number -#ifndef OS_WIN - ASSERT_GT(static_cast(count), 256 * 0.8); - ASSERT_LT(static_cast(count), 266 * 1.2); -#endif - sleeping_task_low.WaitUntilDone(); + // Case 5: Put followed by snapshot followed by another Put + // Both puts should remain. + Put(1, "foo", "v6"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put(1, "foo", "v7"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); + db_->ReleaseSnapshot(snapshot); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); -} -#endif // ROCKSDB_LITE + // clear database + Delete(1, "foo"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); -#if ROCKSDB_USING_THREAD_STATUS -namespace { -void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type, - int expected_count) { - int op_count = 0; - std::vector thread_list; - ASSERT_OK(env->GetThreadList(&thread_list)); - for (auto thread : thread_list) { - if (thread.operation_type == op_type) { - op_count++; - } - } - ASSERT_EQ(op_count, expected_count); + // Case 5: snapshot followed by a put followed by another Put + // Only the last put should remain. 
+ const Snapshot* snapshot1 = db_->GetSnapshot(); + Put(1, "foo", "v8"); + Put(1, "foo", "v9"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); + db_->ReleaseSnapshot(snapshot1); + } while (ChangeCompactOptions()); } -} // namespace -TEST_F(DBTest, GetThreadStatus) { - Options options; - options.env = env_; - options.enable_thread_tracking = true; - TryReopen(options); +TEST_F(DBTest, FlushOneColumnFamily) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); - std::vector thread_list; - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(Put(0, "Default", "Default")); + ASSERT_OK(Put(1, "pikachu", "pikachu")); + ASSERT_OK(Put(2, "ilya", "ilya")); + ASSERT_OK(Put(3, "muromec", "muromec")); + ASSERT_OK(Put(4, "dobrynia", "dobrynia")); + ASSERT_OK(Put(5, "nikitich", "nikitich")); + ASSERT_OK(Put(6, "alyosha", "alyosha")); + ASSERT_OK(Put(7, "popovich", "popovich")); - for (int i = 0; i < 2; ++i) { - // repeat the test with differet number of high / low priority threads - const int kTestCount = 3; - const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5}; - const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3}; - for (int test = 0; test < kTestCount; ++test) { - // Change the number of threads in high / low priority pool. 
- env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH); - env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); - // Wait to ensure the all threads has been registered - env_->SleepForMicroseconds(100000); - s = env_->GetThreadList(&thread_list); - ASSERT_OK(s); - unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES]; - memset(thread_type_counts, 0, sizeof(thread_type_counts)); - for (auto thread : thread_list) { - ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES); - thread_type_counts[thread.thread_type]++; - } - // Verify the total number of threades - ASSERT_EQ( - thread_type_counts[ThreadStatus::HIGH_PRIORITY] + - thread_type_counts[ThreadStatus::LOW_PRIORITY], - kHighPriCounts[test] + kLowPriCounts[test]); - // Verify the number of high-priority threads - ASSERT_EQ( - thread_type_counts[ThreadStatus::HIGH_PRIORITY], - kHighPriCounts[test]); - // Verify the number of low-priority threads - ASSERT_EQ( - thread_type_counts[ThreadStatus::LOW_PRIORITY], - kLowPriCounts[test]); - } - if (i == 0) { - // repeat the test with multiple column families - CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - handles_, true); - } + for (int i = 0; i < 8; ++i) { + Flush(i); + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), i + 1U); } - db_->DropColumnFamily(handles_[2]); - delete handles_[2]; - handles_.erase(handles_.begin() + 2); - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - handles_, true); - Close(); - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - handles_, true); -} - -TEST_F(DBTest, DisableThreadStatus) { - Options options; - options.env = env_; - options.enable_thread_tracking = false; - TryReopen(options); - CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); - // Verify non of the column family info exists - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - 
handles_, false); } -TEST_F(DBTest, ThreadStatusFlush) { - Options options; - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - options.enable_thread_tracking = true; - options = CurrentOptions(options); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, SharedWriteBuffer) { + Options options = CurrentOptions(); + options.db_write_buffer_size = 100000; // this is the real limit + options.write_buffer_size = 500000; // this is never hit + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"}, - {"DBTest::ThreadStatusFlush:2", - "FlushJob::LogAndNotifyTableFileCreation()"}, - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Trigger a flush on CF "nikitich" + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(1), DummyString(90000))); + ASSERT_OK(Put(2, Key(2), DummyString(20000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } - CreateAndReopenWithCF({"pikachu"}, options); - VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); + // "dobrynia": 20KB + // Flush 'dobrynia' + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + ASSERT_OK(Put(2, Key(2), DummyString(70000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + 
dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_EQ("v1", Get(1, "foo")); - VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); + // "nikitich" still has has data of 80KB + // Inserting Data in "dobrynia" triggers "nikitich" flushing. + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + ASSERT_OK(Put(2, Key(2), DummyString(40000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } - uint64_t num_running_flushes = 0; - db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); - ASSERT_EQ(num_running_flushes, 0); + // "dobrynia" still has 40KB + ASSERT_OK(Put(1, Key(2), DummyString(20000))); + ASSERT_OK(Put(0, Key(1), DummyString(10000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // This should triggers no flush + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + 
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + // "default": 10KB, "pikachu": 20KB, "dobrynia": 40KB + ASSERT_OK(Put(1, Key(2), DummyString(40000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // This should triggers flush of "pikachu" + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } - // The first sync point is to make sure there's one flush job - // running when we perform VerifyOperationCount(). - TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1"); - VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1); - db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); - ASSERT_EQ(num_running_flushes, 1); - // This second sync point is to ensure the flush job will not - // be completed until we already perform VerifyOperationCount(). - TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2"); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + // "default": 10KB, "dobrynia": 40KB + // Some remaining writes so 'default', 'dobrynia' and 'nikitich' flush on + // closure. 
+ ASSERT_OK(Put(3, Key(1), DummyString(1))); + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + } } +#endif // ROCKSDB_LITE -TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) { - const int kTestKeySize = 16; - const int kTestValueSize = 984; - const int kEntrySize = kTestKeySize + kTestValueSize; - const int kEntriesPerBuffer = 100; - Options options; +TEST_F(DBTest, PurgeInfoLogs) { + Options options = CurrentOptions(); + options.keep_log_file_num = 5; options.create_if_missing = true; - options.write_buffer_size = kEntrySize * kEntriesPerBuffer; - options.compaction_style = kCompactionStyleLevel; - options.target_file_size_base = options.write_buffer_size; - options.max_bytes_for_level_base = options.target_file_size_base * 2; - options.max_bytes_for_level_multiplier = 2; - options.compression = kNoCompression; - options = CurrentOptions(options); - options.env = env_; - options.enable_thread_tracking = true; - const int kNumL0Files = 4; - options.level0_file_num_compaction_trigger = kNumL0Files; - options.max_subcompactions = max_subcompactions_; - - rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"}, - {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"}, - {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"}, - }); - for (int tests = 0; tests < 2; ++tests) { - DestroyAndReopen(options); - rocksdb::SyncPoint::GetInstance()->ClearTrace(); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + for (int mode = 0; mode <= 1; mode++) { + if (mode == 1) { + options.db_log_dir = 
dbname_ + "_logs"; + env_->CreateDirIfMissing(options.db_log_dir); + } else { + options.db_log_dir = ""; + } + for (int i = 0; i < 8; i++) { + Reopen(options); + } - Random rnd(301); - // The Put Phase. - for (int file = 0; file < kNumL0Files; ++file) { - for (int key = 0; key < kEntriesPerBuffer; ++key) { - ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer), - RandomString(&rnd, kTestValueSize))); + std::vector files; + env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir, + &files); + int info_log_count = 0; + for (std::string file : files) { + if (file.find("LOG") != std::string::npos) { + info_log_count++; } - Flush(); } - // This makes sure a compaction won't be scheduled until - // we have done with the above Put Phase. - uint64_t num_running_compactions = 0; - db_->GetIntProperty(DB::Properties::kNumRunningCompactions, - &num_running_compactions); - ASSERT_EQ(num_running_compactions, 0); - TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0"); - ASSERT_GE(NumTableFilesAtLevel(0), - options.level0_file_num_compaction_trigger); - - // This makes sure at least one compaction is running. - TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1"); + ASSERT_EQ(5, info_log_count); - if (options.enable_thread_tracking) { - // expecting one single L0 to L1 compaction - VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1); - } else { - // If thread tracking is not enabled, compaction count should be 0. - VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0); + Destroy(options); + // For mode (1), test DestroyDB() to delete all the logs under DB dir. + // For mode (2), no info log file should have been put under DB dir. 
+ std::vector db_files; + env_->GetChildren(dbname_, &db_files); + for (std::string file : db_files) { + ASSERT_TRUE(file.find("LOG") == std::string::npos); } - db_->GetIntProperty(DB::Properties::kNumRunningCompactions, - &num_running_compactions); - ASSERT_EQ(num_running_compactions, 1); - // TODO(yhchiang): adding assert to verify each compaction stage. - TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2"); - // repeat the test with disabling thread tracking. - options.enable_thread_tracking = false; - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + if (mode == 1) { + // Cleaning up + env_->GetChildren(options.db_log_dir, &files); + for (std::string file : files) { + env_->DeleteFile(options.db_log_dir + "/" + file); + } + env_->DeleteDir(options.db_log_dir); + } } } -TEST_P(DBTestWithParam, PreShutdownManualCompaction) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - options.max_subcompactions = max_subcompactions_; - CreateAndReopenWithCF({"pikachu"}, options); - - // iter - 0 with 7 levels - // iter - 1 with 3 levels - for (int iter = 0; iter < 2; ++iter) { - MakeTables(3, "p", "q", 1); - ASSERT_EQ("1,1,1", FilesPerLevel(1)); - - // Compaction range falls before files - Compact(1, "", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel(1)); - - // Compaction range falls after files - Compact(1, "r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel(1)); - - // Compaction range overlaps files - Compact(1, "p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel(1)); - - // Populate a different range - MakeTables(3, "c", "e", 1); - ASSERT_EQ("1,1,2", FilesPerLevel(1)); - - // Compact just the new range - Compact(1, "b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel(1)); - - // Compact all - MakeTables(1, "a", "z", 1); - ASSERT_EQ("1,0,2", FilesPerLevel(1)); - CancelAllBackgroundWork(db_); - db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); - ASSERT_EQ("1,0,2", FilesPerLevel(1)); +#ifndef ROCKSDB_LITE +// Multi-threaded test: 
+namespace { - if (iter == 0) { - options = CurrentOptions(); - options.max_background_flushes = 0; - options.num_levels = 3; - options.create_if_missing = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - } - } -} +static const int kColumnFamilies = 10; +static const int kNumThreads = 10; +static const int kTestSeconds = 10; +static const int kNumKeys = 1000; -TEST_F(DBTest, PreShutdownFlush) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(Put(1, "key", "value")); - CancelAllBackgroundWork(db_); - Status s = - db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); - ASSERT_TRUE(s.IsShutdownInProgress()); -} +struct MTState { + DBTest* test; + std::atomic stop; + std::atomic counter[kNumThreads]; + std::atomic thread_done[kNumThreads]; +}; -TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { - const int kTestKeySize = 16; - const int kTestValueSize = 984; - const int kEntrySize = kTestKeySize + kTestValueSize; - const int kEntriesPerBuffer = 40; - const int kNumL0Files = 4; +struct MTThread { + MTState* state; + int id; +}; - const int kHighPriCount = 3; - const int kLowPriCount = 5; - env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); - env_->SetBackgroundThreads(kLowPriCount, Env::LOW); +static void MTThreadBody(void* arg) { + MTThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->state->test->db_; + int counter = 0; + fprintf(stderr, "... 
starting thread %d\n", id); + Random rnd(1000 + id); + char valbuf[1500]; + while (t->state->stop.load(std::memory_order_acquire) == false) { + t->state->counter[id].store(counter, std::memory_order_release); - Options options; - options.create_if_missing = true; - options.write_buffer_size = kEntrySize * kEntriesPerBuffer; - options.compaction_style = kCompactionStyleLevel; - options.target_file_size_base = options.write_buffer_size; - options.max_bytes_for_level_base = - options.target_file_size_base * kNumL0Files; - options.compression = kNoCompression; - options = CurrentOptions(options); - options.env = env_; - options.enable_thread_tracking = true; - options.level0_file_num_compaction_trigger = kNumL0Files; - options.max_bytes_for_level_multiplier = 2; - options.max_background_compactions = kLowPriCount; - options.level0_stop_writes_trigger = 1 << 10; - options.level0_slowdown_writes_trigger = 1 << 10; - options.max_subcompactions = max_subcompactions_; + int key = rnd.Uniform(kNumKeys); + char keybuf[20]; + snprintf(keybuf, sizeof(keybuf), "%016d", key); - TryReopen(options); - Random rnd(301); + if (rnd.OneIn(2)) { + // Write values of the form . + // into each of the CFs + // We add some padding for force compactions. + int unique_id = rnd.Uniform(1000000); - std::vector thread_list; - // Delay both flush and compaction - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"}, - {"CompactionJob::Run():Start", - "DBTest::PreShutdownMultipleCompaction:Preshutdown"}, - {"CompactionJob::Run():Start", - "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"}, - {"DBTest::PreShutdownMultipleCompaction:Preshutdown", - "CompactionJob::Run():End"}, - {"CompactionJob::Run():End", - "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}}); + // Half of the time directly use WriteBatch. Half of the time use + // WriteBatchWithIndex. 
+ if (rnd.OneIn(2)) { + WriteBatch batch; + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); + } else { + WriteBatchWithIndex batch(db->GetOptions().comparator); + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); + } + } else { + // Read a value and verify that it matches the pattern written above + // and that writes to all column families were atomic (unique_id is the + // same) + std::vector keys(kColumnFamilies, Slice(keybuf)); + std::vector values; + std::vector statuses = + db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values); + Status s = statuses[0]; + // all statuses have to be the same + for (size_t i = 1; i < statuses.size(); ++i) { + // they are either both ok or both not-found + ASSERT_TRUE((s.ok() && statuses[i].ok()) || + (s.IsNotFound() && statuses[i].IsNotFound())); + } + if (s.IsNotFound()) { + // Key has not yet been written + } else { + // Check that the writer thread counter is >= the counter in the value + ASSERT_OK(s); + int unique_id = -1; + for (int i = 0; i < kColumnFamilies; ++i) { + int k, w, c, cf, u; + ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c, + &cf, &u)) + << values[i]; + ASSERT_EQ(k, key); + ASSERT_GE(w, 0); + ASSERT_LT(w, kNumThreads); + ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire)); + ASSERT_EQ(cf, i); + if (i == 0) { + unique_id = u; + } else { + // this checks that updates across column families happened + // atomically -- all unique ids are the same + ASSERT_EQ(u, unique_id); + } + } + } + } + 
counter++; + } + t->state->thread_done[id].store(true, std::memory_order_release); + fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter)); +} - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +} // namespace - // Make rocksdb busy - int key = 0; - // check how many threads are doing compaction using GetThreadList - int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; - for (int file = 0; file < 16 * kNumL0Files; ++file) { - for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); - } +class MultiThreadedDBTest : public DBTest, + public ::testing::WithParamInterface { + public: + virtual void SetUp() override { option_config_ = GetParam(); } - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; + static std::vector GenerateOptionConfigs() { + std::vector optionConfigs; + for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) { + // skip as HashCuckooRep does not support snapshot + if (optionConfig != kHashCuckoo) { + optionConfigs.push_back(optionConfig); + } } + return optionConfigs; + } +}; - // Speed up the test - if (operation_count[ThreadStatus::OP_FLUSH] > 1 && - operation_count[ThreadStatus::OP_COMPACTION] > - 0.6 * options.max_background_compactions) { - break; - } - if (file == 15 * kNumL0Files) { - TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); - } +TEST_P(MultiThreadedDBTest, MultiThreaded) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + std::vector cfs; + for (int i = 1; i < kColumnFamilies; ++i) { + cfs.push_back(ToString(i)); + } + CreateAndReopenWithCF(cfs, CurrentOptions(options_override)); + // Initialize state + MTState mt; + mt.test = this; + mt.stop.store(false, std::memory_order_release); + for (int id = 0; id < kNumThreads; id++) { + mt.counter[id].store(0, std::memory_order_release); + 
mt.thread_done[id].store(false, std::memory_order_release); } - TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); - ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); - CancelAllBackgroundWork(db_); - TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); - // Record the number of compactions at a time. - for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { - operation_count[i] = 0; + // Start threads + MTThread thread[kNumThreads]; + for (int id = 0; id < kNumThreads; id++) { + thread[id].state = &mt; + thread[id].id = id; + env_->StartThread(MTThreadBody, &thread[id]); } - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; + + // Let them run for a while + env_->SleepForMicroseconds(kTestSeconds * 1000000); + + // Stop the threads and wait for them to finish + mt.stop.store(true, std::memory_order_release); + for (int id = 0; id < kNumThreads; id++) { + while (mt.thread_done[id].load(std::memory_order_acquire) == false) { + env_->SleepForMicroseconds(100000); + } } - ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); } -TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { - const int kTestKeySize = 16; - const int kTestValueSize = 984; - const int kEntrySize = kTestKeySize + kTestValueSize; - const int kEntriesPerBuffer = 40; - const int kNumL0Files = 4; +INSTANTIATE_TEST_CASE_P( + MultiThreaded, MultiThreadedDBTest, + ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs())); +#endif // ROCKSDB_LITE - const int kHighPriCount = 3; - const int kLowPriCount = 5; - env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); - env_->SetBackgroundThreads(kLowPriCount, Env::LOW); +// Group commit test: +namespace { - Options options; - options.create_if_missing = true; - options.write_buffer_size = kEntrySize * kEntriesPerBuffer; - options.compaction_style = kCompactionStyleLevel; - 
options.target_file_size_base = options.write_buffer_size; - options.max_bytes_for_level_base = - options.target_file_size_base * kNumL0Files; - options.compression = kNoCompression; - options = CurrentOptions(options); - options.env = env_; - options.enable_thread_tracking = true; - options.level0_file_num_compaction_trigger = kNumL0Files; - options.max_bytes_for_level_multiplier = 2; - options.max_background_compactions = kLowPriCount; - options.level0_stop_writes_trigger = 1 << 10; - options.level0_slowdown_writes_trigger = 1 << 10; - options.max_subcompactions = max_subcompactions_; +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; - TryReopen(options); - Random rnd(301); +struct GCThread { + DB* db; + int id; + std::atomic done; +}; - std::vector thread_list; - // Delay both flush and compaction - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBTest::PreShutdownCompactionMiddle:Preshutdown", - "CompactionJob::Run():Inprogress"}, - {"CompactionJob::Run():Start", - "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"}, - {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"}, - {"CompactionJob::Run():End", - "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}}); +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(ToString(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} - // Make rocksdb busy - int key = 0; - // check how many threads are doing compaction using GetThreadList - int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; - for (int file = 0; file < 16 * kNumL0Files; ++file) { - for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); +} // namespace + +TEST_F(DBTest, GroupCommitTest) { + do { + Options options = 
CurrentOptions(); + options.env = env_; + env_->log_write_slowdown_.store(100); + options.statistics = rocksdb::CreateDBStatistics(); + Reopen(options); + + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); } - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } } + env_->log_write_slowdown_.store(0); - // Speed up the test - if (operation_count[ThreadStatus::OP_FLUSH] > 1 && - operation_count[ThreadStatus::OP_COMPACTION] > - 0.6 * options.max_background_compactions) { - break; + ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); + + std::vector expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(ToString(i)); } - if (file == 15 * kNumL0Files) { - TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction"); + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); } - } + ASSERT_TRUE(!itr->Valid()); + delete itr; - ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); - CancelAllBackgroundWork(db_); - TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); - TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); - // Record the number of compactions at a time. 
- for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { - operation_count[i] = 0; - } - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; - } - ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); + HistogramData hist_data = {0, 0, 0, 0, 0}; + options.statistics->histogramData(DB_WRITE, &hist_data); + ASSERT_GT(hist_data.average, 0.0); + } while (ChangeOptions(kSkipNoSeekToLast)); } -#endif // ROCKSDB_USING_THREAD_STATUS - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, FlushOnDestroy) { - WriteOptions wo; - wo.disableWAL = true; - ASSERT_OK(Put("foo", "v1", wo)); - CancelAllBackgroundWork(db_); +namespace { +typedef std::map KVMap; } -namespace { -class OnFileDeletionListener : public EventListener { +class ModelDB : public DB { public: - OnFileDeletionListener() : - matched_count_(0), - expected_file_name_("") {} + class ModelSnapshot : public Snapshot { + public: + KVMap map_; - void SetExpectedFileName( - const std::string file_name) { - expected_file_name_ = file_name; - } + virtual SequenceNumber GetSequenceNumber() const override { + // no need to call this + assert(false); + return 0; + } + }; - void VerifyMatchedCount(size_t expected_value) { - ASSERT_EQ(matched_count_, expected_value); + explicit ModelDB(const Options& options) : options_(options) {} + using DB::Put; + virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& k, const Slice& v) override { + WriteBatch batch; + batch.Put(cf, k, v); + return Write(o, &batch); } - - void OnTableFileDeleted( - const TableFileDeletionInfo& info) override { - if (expected_file_name_ != "") { - ASSERT_EQ(expected_file_name_, info.file_path); - expected_file_name_ = ""; - matched_count_++; - } + using DB::Delete; + virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + batch.Delete(cf, key); + return Write(o, &batch); } - - private: - size_t 
matched_count_; - std::string expected_file_name_; -}; - -} // namespace - -TEST_F(DBTest, DynamicLevelCompressionPerLevel) { - if (!Snappy_Supported()) { - return; + using DB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + batch.SingleDelete(cf, key); + return Write(o, &batch); } - const int kNKeys = 120; - int keys[kNKeys]; - for (int i = 0; i < kNKeys; i++) { - keys[i] = i; + using DB::Merge; + virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& k, const Slice& v) override { + WriteBatch batch; + batch.Merge(cf, k, v); + return Write(o, &batch); + } + using DB::Get; + virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf, + const Slice& key, std::string* value) override { + return Status::NotSupported(key); } - std::random_shuffle(std::begin(keys), std::end(keys)); - - Random rnd(301); - Options options; - options.create_if_missing = true; - options.db_write_buffer_size = 20480; - options.write_buffer_size = 20480; - options.max_write_buffer_number = 2; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 2; - options.target_file_size_base = 2048; - options.level_compaction_dynamic_level_bytes = true; - options.max_bytes_for_level_base = 102400; - options.max_bytes_for_level_multiplier = 4; - options.max_background_compactions = 1; - options.num_levels = 5; - - options.compression_per_level.resize(3); - options.compression_per_level[0] = kNoCompression; - options.compression_per_level[1] = kNoCompression; - options.compression_per_level[2] = kSnappyCompression; - OnFileDeletionListener* listener = new OnFileDeletionListener(); - options.listeners.emplace_back(listener); + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) 
override { + std::vector s(keys.size(), + Status::NotSupported("Not implemented.")); + return s; + } - DestroyAndReopen(options); +#ifndef ROCKSDB_LITE + using DB::AddFile; + virtual Status AddFile(ColumnFamilyHandle* column_family, + const ExternalSstFileInfo* file_path, + bool move_file) override { + return Status::NotSupported("Not implemented."); + } + virtual Status AddFile(ColumnFamilyHandle* column_family, + const std::string& file_path, + bool move_file) override { + return Status::NotSupported("Not implemented."); + } - // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should - // be compressed, so total data size should be more than 80K. - for (int i = 0; i < 20; i++) { - ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override { + return Status(); } - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_EQ(NumTableFilesAtLevel(3), 0); - ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U); + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override { + return Status(); + } +#endif // ROCKSDB_LITE - // Insert 400KB. 
Some data will be compressed - for (int i = 21; i < 120; i++) { - ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); + using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + if (value_found != nullptr) { + *value_found = false; + } + return true; // Not Supported directly } - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U); - // Make sure data in files in L3 is not compacted by removing all files - // in L4 and calculate number of rows - ASSERT_OK(dbfull()->SetOptions({ - {"disable_auto_compactions", "true"}, - })); - ColumnFamilyMetaData cf_meta; - db_->GetColumnFamilyMetaData(&cf_meta); - for (auto file : cf_meta.levels[4].files) { - listener->SetExpectedFileName(dbname_ + file.name); - ASSERT_OK(dbfull()->DeleteFile(file.name)); + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override { + if (options.snapshot == nullptr) { + KVMap* saved = new KVMap; + *saved = map_; + return new ModelIter(saved, true); + } else { + const KVMap* snapshot_state = + &(reinterpret_cast(options.snapshot)->map_); + return new ModelIter(snapshot_state, false); + } } - listener->VerifyMatchedCount(cf_meta.levels[4].files.size()); - - int num_keys = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - num_keys++; + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_family, + std::vector* iterators) override { + return Status::NotSupported("Not supported yet"); + } + virtual const Snapshot* GetSnapshot() override { + ModelSnapshot* snapshot = new ModelSnapshot; + snapshot->map_ = map_; + return snapshot; } - 
ASSERT_OK(iter->status()); - ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U); -} -TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { - if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) { - return; + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + delete reinterpret_cast(snapshot); } - const int kNKeys = 500; - int keys[kNKeys]; - for (int i = 0; i < kNKeys; i++) { - keys[i] = i; + + virtual Status Write(const WriteOptions& options, + WriteBatch* batch) override { + class Handler : public WriteBatch::Handler { + public: + KVMap* map_; + virtual void Put(const Slice& key, const Slice& value) override { + (*map_)[key.ToString()] = value.ToString(); + } + virtual void Merge(const Slice& key, const Slice& value) override { + // ignore merge for now + // (*map_)[key.ToString()] = value.ToString(); + } + virtual void Delete(const Slice& key) override { + map_->erase(key.ToString()); + } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); } - std::random_shuffle(std::begin(keys), std::end(keys)); - Random rnd(301); - Options options; - options.create_if_missing = true; - options.db_write_buffer_size = 6000; - options.write_buffer_size = 6000; - options.max_write_buffer_number = 2; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 2; - options.soft_pending_compaction_bytes_limit = 1024 * 1024; + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return false; + } + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override { + return false; + } + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { + return false; + } + using DB::GetApproximateSizes; + virtual void 
GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes, + bool include_memtable) override { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* start, const Slice* end) override { + return Status::NotSupported("Not supported operation."); + } - // Use file size to distinguish levels - // L1: 10, L2: 20, L3 40, L4 80 - // L0 is less than 30 - options.target_file_size_base = 10; - options.target_file_size_multiplier = 2; + using DB::CompactFiles; + virtual Status CompactFiles(const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, + const int output_path_id = -1) override { + return Status::NotSupported("Not supported operation."); + } - options.level_compaction_dynamic_level_bytes = true; - options.max_bytes_for_level_base = 200; - options.max_bytes_for_level_multiplier = 8; - options.max_background_compactions = 1; - options.num_levels = 5; - std::shared_ptr mtf(new mock::MockTableFactory); - options.table_factory = mtf; + Status PauseBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } - options.compression_per_level.resize(3); - options.compression_per_level[0] = kNoCompression; - options.compression_per_level[1] = kLZ4Compression; - options.compression_per_level[2] = kZlibCompression; + Status ContinueBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } - DestroyAndReopen(options); - // When base level is L4, L4 is LZ4. 
- std::atomic num_zlib(0); - std::atomic num_lz4(0); - std::atomic num_no(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); - if (compaction->output_level() == 4) { - ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); - num_lz4.fetch_add(1); - } - }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = reinterpret_cast(arg); - ASSERT_TRUE(*compression == kNoCompression); - num_no.fetch_add(1); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Status EnableAutoCompaction( + const std::vector& column_family_handles) override { + return Status::NotSupported("Not supported operation."); + } - for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return 1; + } - if (i % 25 == 0) { - dbfull()->TEST_WaitForFlushMemTable(); - } + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel( + ColumnFamilyHandle* column_family) override { + return 1; } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override { + return -1; + } - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_EQ(NumTableFilesAtLevel(3), 0); - ASSERT_GT(NumTableFilesAtLevel(4), 0); - ASSERT_GT(num_no.load(), 2); - ASSERT_GT(num_lz4.load(), 0); - int prev_num_files_l4 = NumTableFilesAtLevel(4); + virtual const std::string& GetName() const override { return name_; } - // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes 
Zlib - num_lz4.store(0); - num_no.store(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); - if (compaction->output_level() == 4 && compaction->start_level() == 3) { - ASSERT_TRUE(compaction->output_compression() == kZlibCompression); - num_zlib.fetch_add(1); - } else { - ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); - num_lz4.fetch_add(1); - } - }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = reinterpret_cast(arg); - ASSERT_TRUE(*compression == kNoCompression); - num_no.fetch_add(1); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + virtual Env* GetEnv() const override { return nullptr; } - for (int i = 101; i < 500; i++) { - ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); - if (i % 100 == 99) { - Flush(); - dbfull()->TEST_WaitForCompact(); - } + using DB::GetOptions; + virtual const Options& GetOptions( + ColumnFamilyHandle* column_family) const override { + return options_; } - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_GT(NumTableFilesAtLevel(3), 0); - ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4); - ASSERT_GT(num_no.load(), 2); - ASSERT_GT(num_lz4.load(), 0); - ASSERT_GT(num_zlib.load(), 0); -} + using DB::GetDBOptions; + virtual const DBOptions& GetDBOptions() const override { return options_; } -TEST_F(DBTest, DynamicCompactionOptions) { - // minimum write buffer size is enforced at 64KB - const uint64_t k32KB = 1 << 15; - const uint64_t k64KB = 1 << 16; - const uint64_t k128KB = 1 << 17; - const uint64_t k1MB = 1 << 20; - const uint64_t k4KB = 1 << 12; - Options options; - options.env = env_; - options.create_if_missing = true; - 
options.compression = kNoCompression; - options.soft_pending_compaction_bytes_limit = 1024 * 1024; - options.write_buffer_size = k64KB; - options.arena_block_size = 4 * k4KB; - options.max_write_buffer_number = 2; - // Compaction related options - options.level0_file_num_compaction_trigger = 3; - options.level0_slowdown_writes_trigger = 4; - options.level0_stop_writes_trigger = 8; - options.max_grandparent_overlap_factor = 10; - options.expanded_compaction_factor = 25; - options.source_compaction_factor = 1; - options.target_file_size_base = k64KB; - options.target_file_size_multiplier = 1; - options.max_bytes_for_level_base = k128KB; - options.max_bytes_for_level_multiplier = 4; + using DB::Flush; + virtual Status Flush(const rocksdb::FlushOptions& options, + ColumnFamilyHandle* column_family) override { + Status ret; + return ret; + } - // Block flush thread and disable compaction thread - env_->SetBackgroundThreads(1, Env::LOW); - env_->SetBackgroundThreads(1, Env::HIGH); - DestroyAndReopen(options); + virtual Status SyncWAL() override { return Status::OK(); } - auto gen_l0_kb = [this](int start, int size, int stride) { - Random rnd(301); - for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); - } - dbfull()->TEST_WaitForFlushMemTable(); - }; +#ifndef ROCKSDB_LITE + virtual Status DisableFileDeletions() override { return Status::OK(); } - // Write 3 files that have the same key range. - // Since level0_file_num_compaction_trigger is 3, compaction should be - // triggered. 
The compaction should result in one L1 file - gen_l0_kb(0, 64, 1); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - gen_l0_kb(0, 64, 1); - ASSERT_EQ(NumTableFilesAtLevel(0), 2); - gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,1", FilesPerLevel()); - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(1U, metadata.size()); - ASSERT_LE(metadata[0].size, k64KB + k4KB); - ASSERT_GE(metadata[0].size, k64KB - k4KB); + virtual Status EnableFileDeletions(bool force) override { + return Status::OK(); + } + virtual Status GetLiveFiles(std::vector&, uint64_t* size, + bool flush_memtable = true) override { + return Status::OK(); + } - // Test compaction trigger and target_file_size_base - // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. - // Writing to 64KB L0 files should trigger a compaction. Since these - // 2 L0 files have the same key range, compaction merge them and should - // result in 2 32KB L1 files. - ASSERT_OK(dbfull()->SetOptions({ - {"level0_file_num_compaction_trigger", "2"}, - {"target_file_size_base", ToString(k32KB) } - })); + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return Status::OK(); + } - gen_l0_kb(0, 64, 1); - ASSERT_EQ("1,1", FilesPerLevel()); - gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,2", FilesPerLevel()); - metadata.clear(); - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(2U, metadata.size()); - ASSERT_LE(metadata[0].size, k32KB + k4KB); - ASSERT_GE(metadata[0].size, k32KB - k4KB); - ASSERT_LE(metadata[1].size, k32KB + k4KB); - ASSERT_GE(metadata[1].size, k32KB - k4KB); + virtual Status DeleteFile(std::string name) override { return Status::OK(); } - // Test max_bytes_for_level_base - // Increase level base size to 256KB and write enough data that will - // fill L1 and L2. L1 size should be around 256KB while L2 size should be - // around 256KB x 4. 
- ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_base", ToString(k1MB) } - })); + virtual Status GetUpdatesSince( + rocksdb::SequenceNumber, unique_ptr*, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) override { + return Status::NotSupported("Not supported in Model DB"); + } - // writing 96 x 64KB => 6 * 1024KB - // (L1 + L2) = (1 + 4) * 1024KB - for (int i = 0; i < 96; ++i) { - gen_l0_kb(i, 64, 96); + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override {} +#endif // ROCKSDB_LITE + + virtual Status GetDbIdentity(std::string& identity) const override { + return Status::OK(); } - dbfull()->TEST_WaitForCompact(); - ASSERT_GT(SizeAtLevel(1), k1MB / 2); - ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); - // Within (0.5, 1.5) of 4MB. - ASSERT_GT(SizeAtLevel(2), 2 * k1MB); - ASSERT_LT(SizeAtLevel(2), 6 * k1MB); + virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; } + + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + return nullptr; + } + + private: + class ModelIter : public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) {} + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const override { return iter_ != map_->end(); } + virtual void SeekToFirst() override { iter_ = map_->begin(); } + virtual void SeekToLast() override { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) override { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() override { ++iter_; } + virtual void Prev() override { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + virtual Slice key() const override { return iter_->first; } + virtual Slice value() const override { return iter_->second; } + virtual 
Status status() const override { return Status::OK(); } - // Test max_bytes_for_level_multiplier and - // max_bytes_for_level_base. Now, reduce both mulitplier and level base, - // After filling enough data that can fit in L1 - L3, we should see L1 size - // reduces to 128KB from 256KB which was asserted previously. Same for L2. - ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_multiplier", "2"}, - {"max_bytes_for_level_base", ToString(k128KB) } - })); + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + std::string name_ = ""; +}; - // writing 20 x 64KB = 10 x 128KB - // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB - for (int i = 0; i < 20; ++i) { - gen_l0_kb(i, 64, 32); - } - dbfull()->TEST_WaitForCompact(); - uint64_t total_size = - SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); - ASSERT_TRUE(total_size < k128KB * 7 * 1.5); +static std::string RandomKey(Random* rnd, int minimum = 0) { + int len; + do { + len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + } while (len < minimum); + return test::RandomKey(rnd, len); +} - // Test level0_stop_writes_trigger. - // Clean up memtable and L0. Block compaction threads. If continue to write - // and flush memtables. 
We should see put stop after 8 memtable flushes - // since level0_stop_writes_trigger = 8 - dbfull()->TEST_FlushMemTable(true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - // Block compaction - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); +static bool CompareIterators(int step, DB* model, DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; int count = 0; - Random rnd(301); - WriteOptions wo; - while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) { count++; - if (dbfull()->TEST_write_controler().IsStopped()) { - sleeping_task_low.WakeUp(); + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; break; } - } - // Stop trigger = 8 - ASSERT_EQ(count, 8); - // Unblock - sleeping_task_low.WaitUntilDone(); - - // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. - // Block compaction thread again. Perform the put and memtable flushes - // until we see the stop after 6 memtable flushes. 
- ASSERT_OK(dbfull()->SetOptions({ - {"level0_stop_writes_trigger", "6"} - })); - dbfull()->TEST_FlushMemTable(true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - // Block compaction again - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); - count = 0; - while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); - count++; - if (dbfull()->TEST_write_controler().IsStopped()) { - sleeping_task_low.WakeUp(); - break; + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; } } - ASSERT_EQ(count, 6); - // Unblock - sleeping_task_low.WaitUntilDone(); - - // Test disable_auto_compactions - // Compaction thread is unblocked but auto compaction is disabled. Write - // 4 L0 files and compaction should be triggered. If auto compaction is - // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of - // L0 files do not change after the call. - ASSERT_OK(dbfull()->SetOptions({ - {"disable_auto_compactions", "true"} - })); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. 
%d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } } - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(0), 4); + delete miter; + delete dbiter; + return ok; +} - // Enable auto compaction and perform the same test, # of L0 files should be - // reduced after compaction. - ASSERT_OK(dbfull()->SetOptions({ - {"disable_auto_compactions", "false"} - })); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); +class DBTestRandomized : public DBTest, + public ::testing::WithParamInterface { + public: + virtual void SetUp() override { option_config_ = GetParam(); } - for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); + static std::vector GenerateOptionConfigs() { + std::vector option_configs; + // skip cuckoo hash as it does not support snapshot. + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (!ShouldSkipOptions(option_config, kSkipDeletesFilterFirst | + kSkipNoSeekToLast | + kSkipHashCuckoo)) { + option_configs.push_back(option_config); + } + } + option_configs.push_back(kBlockBasedTableWithIndexRestartInterval); + return option_configs; } - dbfull()->TEST_WaitForCompact(); - ASSERT_LT(NumTableFilesAtLevel(0), 4); -} -#endif // ROCKSDB_LITE +}; -TEST_F(DBTest, FileCreationRandomFailure) { - Options options; - options.env = env_; - options.create_if_missing = true; - options.write_buffer_size = 100000; // Small write buffer - options.target_file_size_base = 200000; - options.max_bytes_for_level_base = 1000000; - options.max_bytes_for_level_multiplier = 2; +INSTANTIATE_TEST_CASE_P( + DBTestRandomized, DBTestRandomized, + ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs())); +TEST_P(DBTestRandomized, Randomized) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = 
CurrentOptions(options_override); DestroyAndReopen(options); - Random rnd(301); - const int kCDTKeysPerBuffer = 4; - const int kTestSize = kCDTKeysPerBuffer * 4096; - const int kTotalIteration = 100; - // the second half of the test involves in random failure - // of file creation. - const int kRandomFailureTest = kTotalIteration / 2; - std::vector values; - for (int i = 0; i < kTestSize; ++i) { - values.push_back("NOT_FOUND"); - } - for (int j = 0; j < kTotalIteration; ++j) { - if (j == kRandomFailureTest) { - env_->non_writeable_rate_.store(90); - } - for (int k = 0; k < kTestSize; ++k) { - // here we expect some of the Put fails. - std::string value = RandomString(&rnd, 100); - Status s = Put(Key(k), Slice(value)); - if (s.ok()) { - // update the latest successful put - values[k] = value; + Random rnd(test::RandomSeed() + GetParam()); + ModelDB model(options); + const int N = 10000; + const Snapshot* model_snap = nullptr; + const Snapshot* db_snap = nullptr; + std::string k, v; + for (int step = 0; step < N; step++) { + // TODO(sanjay): Test Get() works + int p = rnd.Uniform(100); + int minimum = 0; + if (option_config_ == kHashSkipList || option_config_ == kHashLinkList || + option_config_ == kHashCuckoo || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex || + option_config_ == kBlockBasedTableWithPrefixHashIndex) { + minimum = 1; + } + if (p < 45) { // Put + k = RandomKey(&rnd, minimum); + v = RandomString(&rnd, + rnd.OneIn(20) ? 
100 + rnd.Uniform(100) : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + } else if (p < 90) { // Delete + k = RandomKey(&rnd, minimum); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd, minimum); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } } - // But everything before we simulate the failure-test should succeed. - if (j < kRandomFailureTest) { - ASSERT_OK(s); + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + // For DB instances that use the hash index + block-based table, the + // iterator will be invalid right when seeking a non-existent key, right + // than return a key that is close to it. 
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && + option_config_ != kBlockBasedTableWithPrefixHashIndex) { + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); } + + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + + Reopen(options); + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); } } + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); +} + +TEST_F(DBTest, MultiGetSimple) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "k1", "v1")); + ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Delete(1, "k4")); + ASSERT_OK(Put(1, "k5", "v5")); + ASSERT_OK(Delete(1, "no_key")); + + std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); + + std::vector values(20, "Temporary data to be overwritten"); + std::vector cfs(keys.size(), handles_[1]); + + std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(values[0], "v1"); + ASSERT_EQ(values[1], "v2"); + ASSERT_EQ(values[2], "v3"); + ASSERT_EQ(values[4], "v5"); - // If rocksdb does not do the correct job, internal assert will fail here. 
- dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(s[0]); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_TRUE(s[3].IsNotFound()); + ASSERT_OK(s[4]); + ASSERT_TRUE(s[5].IsNotFound()); + } while (ChangeCompactOptions()); +} - // verify we have the latest successful update - for (int k = 0; k < kTestSize; ++k) { - auto v = Get(Key(k)); - ASSERT_EQ(v, values[k]); - } +TEST_F(DBTest, MultiGetEmpty) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + // Empty Key Set + std::vector keys; + std::vector values; + std::vector cfs; + std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(s.size(), 0U); - // reopen and reverify we have the latest successful update - env_->non_writeable_rate_.store(0); - Reopen(options); - for (int k = 0; k < kTestSize; ++k) { - auto v = Get(Key(k)); - ASSERT_EQ(v, values[k]); - } + // Empty Database, Empty Key Set + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(s.size(), 0U); + + // Empty Database, Search for Keys + keys.resize(2); + keys[0] = "a"; + keys[1] = "b"; + cfs.push_back(handles_[0]); + cfs.push_back(handles_[1]); + s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(static_cast(s.size()), 2); + ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); + } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_LITE -TEST_F(DBTest, DynamicMiscOptions) { - // Test max_sequential_skip_in_iterations - Options options; - options.env = env_; - options.create_if_missing = true; - options.max_sequential_skip_in_iterations = 16; - options.compression = kNoCompression; - options.statistics = rocksdb::CreateDBStatistics(); - DestroyAndReopen(options); +TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = 
CurrentOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { - int key0 = key_start; - int key1 = key_start + 1; - int key2 = key_start + 2; - Random rnd(301); - ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); - for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); - } - ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - iter->Seek(Key(key1)); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Key(key1)), 0); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Key(key2)), 0); - ASSERT_EQ(num_reseek, - TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); - }; - // No reseek - assert_reseek_count(100, 0); + Reopen(options); + ASSERT_OK(Put("k1", "v1")); + Flush(); + ASSERT_OK(Put("k2", "v2")); - ASSERT_OK(dbfull()->SetOptions({ - {"max_sequential_skip_in_iterations", "4"} - })); - // Clear memtable and make new option effective - dbfull()->TEST_FlushMemTable(true); - // Trigger reseek - assert_reseek_count(200, 1); + // Reopen it without prefix extractor, make sure everything still works. + // RocksDB should just fall back to the binary index. 
+ table_options.index_type = BlockBasedTableOptions::kBinarySearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(); - ASSERT_OK(dbfull()->SetOptions({ - {"max_sequential_skip_in_iterations", "16"} - })); - // Clear memtable and make new option effective - dbfull()->TEST_FlushMemTable(true); - // No reseek - assert_reseek_count(300, 1); + Reopen(options); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); } -#endif // ROCKSDB_LITE -TEST_F(DBTest, DontDeletePendingOutputs) { - Options options; - options.env = env_; - options.create_if_missing = true; - DestroyAndReopen(options); +TEST_F(DBTest, ChecksumTest) { + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); - // Every time we write to a table file, call FOF/POF with full DB scan. This - // will make sure our pending_outputs_ protection work correctly - std::function purge_obsolete_files_function = [&]() { - JobContext job_context(0); - dbfull()->TEST_LockMutex(); - dbfull()->FindObsoleteFiles(&job_context, true /*force*/); - dbfull()->TEST_UnlockMutex(); - dbfull()->PurgeObsoleteFiles(job_context); - job_context.Clean(); - }; + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Flush()); // table with crc checksum - env_->table_write_callback_ = &purge_obsolete_files_function; + table_options.checksum = kxxHash; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_OK(Put("e", "f")); + ASSERT_OK(Put("g", "h")); + ASSERT_OK(Flush()); // table with xxhash checksum - for (int i = 0; i < 2; ++i) { - ASSERT_OK(Put("a", "begin")); - ASSERT_OK(Put("z", "end")); - ASSERT_OK(Flush()); - } + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_EQ("b", Get("a")); 
+ ASSERT_EQ("d", Get("c")); + ASSERT_EQ("f", Get("e")); + ASSERT_EQ("h", Get("g")); - // If pending output guard does not work correctly, PurgeObsoleteFiles() will - // delete the file that Compaction is trying to create, causing this: error - // db/db_test.cc:975: IO error: - // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory - Compact("a", "b"); + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_EQ("b", Get("a")); + ASSERT_EQ("d", Get("c")); + ASSERT_EQ("f", Get("e")); + ASSERT_EQ("h", Get("g")); } #ifndef ROCKSDB_LITE -TEST_F(DBTest, DontDeleteMovedFile) { - // This test triggers move compaction and verifies that the file is not - // deleted when it's part of move compaction - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - options.max_bytes_for_level_base = 1024 * 1024; // 1 MB - options.level0_file_num_compaction_trigger = - 2; // trigger compaction when we have 2 files - DestroyAndReopen(options); +TEST_P(DBTestWithParam, FIFOCompactionTest) { + for (int iter = 0; iter < 2; ++iter) { + // first iteration -- auto compaction + // second iteration -- manual compaction + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 100 << 10; // 100KB + options.arena_block_size = 4096; + options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB + options.compression = kNoCompression; + options.create_if_missing = true; + options.max_subcompactions = max_subcompactions_; + if (iter == 1) { + options.disable_auto_compactions = true; + } + options = CurrentOptions(options); + DestroyAndReopen(options); - Random rnd(301); - // Create two 1MB sst files - for (int i = 0; i < 2; ++i) { - // Create 1MB sst file - for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + Random rnd(301); + for (int i = 0; i < 6; ++i) { + 
for (int j = 0; j < 110; ++j) { + ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980))); + } + // flush should happen here + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + if (iter == 0) { + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } else { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + // only 5 files should survive + ASSERT_EQ(NumTableFilesAtLevel(0), 5); + for (int i = 0; i < 50; ++i) { + // these keys should be deleted in previous compaction + ASSERT_EQ("NOT_FOUND", Get(ToString(i))); } - ASSERT_OK(Flush()); } - // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,0,1", FilesPerLevel(0)); +} +#endif // ROCKSDB_LITE - // If the moved file is actually deleted (the move-safeguard in - // ~Version::Version() is not there), we get this failure: - // Corruption: Can't access /000009.sst - Reopen(options); +// verify that we correctly deprecated timeout_hint_us +TEST_F(DBTest, SimpleWriteTimeoutTest) { + WriteOptions write_opt; + write_opt.timeout_hint_us = 0; + ASSERT_OK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); + write_opt.timeout_hint_us = 10; + ASSERT_NOK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); } -TEST_F(DBTest, OptimizeFiltersForHits) { +#ifndef ROCKSDB_LITE +/* + * This test is not reliable enough as it heavily depends on disk behavior. 
+ */ +TEST_F(DBTest, RateLimitingTest) { Options options = CurrentOptions(); - options.write_buffer_size = 64 * 1024; - options.arena_block_size = 4 * 1024; - options.target_file_size_base = 64 * 1024; + options.write_buffer_size = 1 << 20; // 1MB options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 256 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; + options.target_file_size_base = 1 << 20; // 1MB + options.max_bytes_for_level_base = 4 << 20; // 4MB + options.max_bytes_for_level_multiplier = 4; options.compression = kNoCompression; - options.compaction_style = kCompactionStyleLevel; - options.level_compaction_dynamic_level_bytes = true; - BlockBasedTableOptions bbto; - bbto.cache_index_and_filter_blocks = true; - bbto.filter_policy.reset(NewBloomFilterPolicy(10, true)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - options.optimize_filters_for_hits = true; - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); + options.create_if_missing = true; + options.env = env_; + options.IncreaseParallelism(4); + DestroyAndReopen(options); - int numkeys = 200000; + WriteOptions wo; + wo.disableWAL = true; - // Generate randomly shuffled keys, so the updates are almost - // random. 
- std::vector keys; - keys.reserve(numkeys); - for (int i = 0; i < numkeys; i += 2) { - keys.push_back(i); + // # no rate limiting + Random rnd(301); + uint64_t start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); } - std::random_shuffle(std::begin(keys), std::end(keys)); + uint64_t elapsed = env_->NowMicros() - start; + double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed; + Close(); + + // # rate limiting with 0.7 x threshold + options.rate_limiter.reset( + NewGenericRateLimiter(static_cast(0.7 * raw_rate))); + env_->bytes_written_ = 0; + DestroyAndReopen(options); + + start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + } + elapsed = env_->NowMicros() - start; + Close(); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); + double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; + fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio); + ASSERT_TRUE(ratio < 0.8); + + // # rate limiting with half of the raw_rate + options.rate_limiter.reset( + NewGenericRateLimiter(static_cast(raw_rate / 2))); + env_->bytes_written_ = 0; + DestroyAndReopen(options); - int num_inserted = 0; - for (int key : keys) { - ASSERT_OK(Put(1, Key(key), "val")); - if (++num_inserted % 1000 == 0) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - } + start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); } - ASSERT_OK(Put(1, Key(0), "val")); - ASSERT_OK(Put(1, Key(numkeys), "val")); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + elapsed = env_->NowMicros() - start; + Close(); + 
ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); + ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; + fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio); + ASSERT_LT(ratio, 0.6); +} - if (NumTableFilesAtLevel(0, 1) == 0) { - // No Level 0 file. Create one. - ASSERT_OK(Put(1, Key(0), "val")); - ASSERT_OK(Put(1, Key(numkeys), "val")); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - } +TEST_F(DBTest, TableOptionsSanitizeTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); - for (int i = 1; i < numkeys; i += 2) { - ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND"); - } + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewNoopTransform()); + Destroy(options); + ASSERT_TRUE(!TryReopen(options).IsNotSupported()); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + // Test for check of prefix_extractor when hash index is used for + // block-based table + BlockBasedTableOptions to; + to.index_type = BlockBasedTableOptions::kHashSearch; + options = CurrentOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(to)); + ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + ASSERT_OK(TryReopen(options)); +} - // Now we have three sorted run, L0, L5 and L6 with most files in L6 have - // no bloom filter. Most keys be checked bloom filters twice. 
- ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); - ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); +TEST_F(DBTest, MmapAndBufferOptions) { + Options options = CurrentOptions(); - for (int i = 0; i < numkeys; i += 2) { - ASSERT_EQ(Get(1, Key(i)), "val"); - } + // If allow_mmap_reads is on allow_os_buffer must also be on + // On Windows you can have either memory mapped file or a file + // with unbuffered access. +#ifndef OS_WIN + options.allow_os_buffer = false; +#endif + options.allow_mmap_reads = true; + ASSERT_NOK(TryReopen(options)); - // Part 2 (read path): rewrite last level with blooms, then verify they get - // cached only if !optimize_filters_for_hits - options.disable_auto_compactions = true; - options.num_levels = 9; - options.optimize_filters_for_hits = false; - options.statistics = CreateDBStatistics(); - bbto.block_cache.reset(); - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - - ReopenWithColumnFamilies({"default", "mypikachu"}, options); - MoveFilesToLevel(7 /* level */, 1 /* column family index */); - - std::string value = Get(1, Key(0)); - uint64_t prev_cache_filter_hits = - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); - value = Get(1, Key(0)); - ASSERT_EQ(prev_cache_filter_hits + 1, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - // Now that we know the filter blocks exist in the last level files, see if - // filter caching is skipped for this optimization - options.optimize_filters_for_hits = true; - options.statistics = CreateDBStatistics(); - bbto.block_cache.reset(); - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - - ReopenWithColumnFamilies({"default", "mypikachu"}, options); - - value = Get(1, Key(0)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - ASSERT_EQ(2 /* index and data block */, - TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - // 
Check filter block ignored for files preloaded during DB::Open() - options.max_open_files = -1; - options.statistics = CreateDBStatistics(); - bbto.block_cache.reset(); - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - - ReopenWithColumnFamilies({"default", "mypikachu"}, options); - - uint64_t prev_cache_filter_misses = - TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); - prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); - Get(1, Key(0)); - ASSERT_EQ(prev_cache_filter_misses, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(prev_cache_filter_hits, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - // Check filter block ignored for file trivially-moved to bottom level - bbto.block_cache.reset(); - options.max_open_files = 100; // setting > -1 makes it not preload all files - options.statistics = CreateDBStatistics(); - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - - ReopenWithColumnFamilies({"default", "mypikachu"}, options); - - ASSERT_OK(Put(1, Key(numkeys + 1), "val")); - ASSERT_OK(Flush(1)); + // All other combinations are acceptable + options.allow_os_buffer = true; + ASSERT_OK(TryReopen(options)); - int32_t trivial_move = 0; - int32_t non_trivial_move = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + options.allow_os_buffer = false; + options.allow_mmap_reads = false; + ASSERT_OK(TryReopen(options)); - CompactRangeOptions compact_options; - compact_options.bottommost_level_compaction = - BottommostLevelCompaction::kSkip; - compact_options.change_level = true; - compact_options.target_level = 7; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); - - 
ASSERT_EQ(trivial_move, 1); - ASSERT_EQ(non_trivial_move, 0); - - prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); - prev_cache_filter_misses = - TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); - value = Get(1, Key(numkeys + 1)); - ASSERT_EQ(prev_cache_filter_hits, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - ASSERT_EQ(prev_cache_filter_misses, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - - // Check filter block not cached for iterator - bbto.block_cache.reset(); - options.statistics = CreateDBStatistics(); - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - - ReopenWithColumnFamilies({"default", "mypikachu"}, options); - - std::unique_ptr iter(db_->NewIterator(ReadOptions(), handles_[1])); - iter->SeekToFirst(); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - ASSERT_EQ(2 /* index and data block */, - TestGetTickerCount(options, BLOCK_CACHE_ADD)); + options.allow_os_buffer = true; + ASSERT_OK(TryReopen(options)); } -#endif // ROCKSDB_LITE -TEST_F(DBTest, L0L1L2AndUpHitCounter) { +TEST_F(DBTest, ConcurrentMemtableNotSupported) { Options options = CurrentOptions(); - options.write_buffer_size = 32 * 1024; - options.target_file_size_base = 32 * 1024; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 64 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); + options.allow_concurrent_memtable_write = true; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; + options.create_if_missing = true; - int numkeys = 20000; - for (int i = 0; i < numkeys; i++) { - 
ASSERT_OK(Put(1, Key(i), "val")); - } - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + DestroyDB(dbname_, options); + options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4)); + ASSERT_NOK(TryReopen(options)); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + options.memtable_factory.reset(new SkipListFactory); + ASSERT_OK(TryReopen(options)); - for (int i = 0; i < numkeys; i++) { - ASSERT_EQ(Get(1, Key(i)), "val"); - } + ColumnFamilyOptions cf_options(options); + cf_options.memtable_factory.reset( + NewHashLinkListRepFactory(4, 0, 3, true, 4)); + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle)); +} - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); +#endif // ROCKSDB_LITE - ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + - TestGetTickerCount(options, GET_HIT_L1) + - TestGetTickerCount(options, GET_HIT_L2_AND_UP)); -} +TEST_F(DBTest, SanitizeNumThreads) { + for (int attempt = 0; attempt < 2; attempt++) { + const size_t kTotalTasks = 8; + test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; -TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { - // iter 0 -- zlib - // iter 1 -- bzip2 - // iter 2 -- lz4 - // iter 3 -- lz4HC - CompressionType compressions[] = {kZlibCompression, kBZip2Compression, - kLZ4Compression, kLZ4HCCompression}; - for (int iter = 0; iter < 4; ++iter) { - if (!CompressionTypeSupported(compressions[iter])) { - continue; + Options options = CurrentOptions(); + if (attempt == 0) { + options.max_background_compactions = 3; + options.max_background_flushes = 2; } - // first_table_version 1 -- generate with table_version == 1, read with - // table_version == 2 - // first_table_version 2 -- generate with 
table_version == 2, read with - // table_version == 1 - for (int first_table_version = 1; first_table_version <= 2; - ++first_table_version) { - BlockBasedTableOptions table_options; - table_options.format_version = first_table_version; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - Options options = CurrentOptions(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.create_if_missing = true; - options.compression = compressions[iter]; - DestroyAndReopen(options); + options.create_if_missing = true; + DestroyAndReopen(options); - int kNumKeysWritten = 100000; + for (size_t i = 0; i < kTotalTasks; i++) { + // Insert 4 tasks to low priority queue and 4 tasks to high priority queue + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], + (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); + } - Random rnd(301); - for (int i = 0; i < kNumKeysWritten; ++i) { - // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); - } + // Wait 100 milliseconds for them to be scheduled. + env_->SleepForMicroseconds(100000); - table_options.format_version = first_table_version == 1 ? 2 : 1; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); - ASSERT_EQ(r.substr(128), std::string(128, 'a')); - } + // pool size 3, total task 4. Queue size should be 1. + ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW)); + // pool size 2, total task 4. Queue size should be 2. 
+ ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); } + + ASSERT_OK(Put("abc", "def")); + ASSERT_EQ("def", Get("abc")); + Flush(); + ASSERT_EQ("def", Get("abc")); } } -TEST_F(DBTest, MutexWaitStatsDisabledByDefault) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, options); - const uint64_t kMutexWaitDelay = 100; - ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, - kMutexWaitDelay); - ASSERT_OK(Put("hello", "rocksdb")); - ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0); - ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); -} +TEST_F(DBTest, WriteSingleThreadEntry) { + std::vector threads; + dbfull()->TEST_LockMutex(); + auto w = dbfull()->TEST_BeginWrite(); + threads.emplace_back([&] { Put("a", "b"); }); + env_->SleepForMicroseconds(10000); + threads.emplace_back([&] { Flush(); }); + env_->SleepForMicroseconds(10000); + dbfull()->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->TEST_EndWrite(w); + dbfull()->TEST_UnlockMutex(); -TEST_F(DBTest, MutexWaitStats) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.statistics->stats_level_ = StatsLevel::kAll; - CreateAndReopenWithCF({"pikachu"}, options); - const uint64_t kMutexWaitDelay = 100; - ThreadStatusUtil::TEST_SetStateDelay( - ThreadStatus::STATE_MUTEX_WAIT, kMutexWaitDelay); - ASSERT_OK(Put("hello", "rocksdb")); - ASSERT_GE(TestGetTickerCount( - options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay); - ThreadStatusUtil::TEST_SetStateDelay( - ThreadStatus::STATE_MUTEX_WAIT, 0); + for (auto& t : threads) { + t.join(); + } } -#ifndef ROCKSDB_LITE -// This reproduces a bug where we don't delete a file because when 
it was -// supposed to be deleted, it was blocked by pending_outputs -// Consider: -// 1. current file_number is 13 -// 2. compaction (1) starts, blocks deletion of all files starting with 13 -// (pending outputs) -// 3. file 13 is created by compaction (2) -// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file -// 13 has no references, it is put into VersionSet::obsolete_files_ -// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13 -// is deleted from obsolete_files_ set. -// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by -// pending outputs since compaction (1) is still running. It is not deleted and -// it is not present in obsolete_files_ anymore. Therefore, we never delete it. -TEST_F(DBTest, DeleteObsoleteFilesPendingOutputs) { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 2 * 1024 * 1024; // 2 MB - options.max_bytes_for_level_base = 1024 * 1024; // 1 MB - options.level0_file_num_compaction_trigger = - 2; // trigger compaction when we have 2 files - options.max_background_flushes = 2; - options.max_background_compactions = 2; - - OnFileDeletionListener* listener = new OnFileDeletionListener(); - options.listeners.emplace_back(listener); - - Reopen(options); - - Random rnd(301); - // Create two 1MB sst files - for (int i = 0; i < 2; ++i) { - // Create 1MB sst file - for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); - } - ASSERT_OK(Flush()); - } - // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,0,1", FilesPerLevel(0)); +TEST_F(DBTest, DisableDataSyncTest) { + env_->sync_counter_.store(0); + // iter 0 -- no sync + // iter 1 -- sync + for (int iter = 0; iter < 2; ++iter) { + Options options = CurrentOptions(); + options.disableDataSync = iter == 0; + options.create_if_missing = true; + options.num_levels = 10; + options.env 
= env_; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); - test::SleepingBackgroundTask blocking_thread; - port::Mutex mutex_; - bool already_blocked(false); + MakeTables(10, "a", "z"); + Compact("a", "z"); - // block the flush - std::function block_first_time = [&]() { - bool blocking = false; - { - MutexLock l(&mutex_); - if (!already_blocked) { - blocking = true; - already_blocked = true; - } - } - if (blocking) { - blocking_thread.DoSleep(); + if (iter == 0) { + ASSERT_EQ(env_->sync_counter_.load(), 0); + } else { + ASSERT_GT(env_->sync_counter_.load(), 0); } - }; - env_->table_write_callback_ = &block_first_time; - // Create 1MB sst file - for (int j = 0; j < 256; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); + Destroy(options); } - // this should trigger a flush, which is blocked with block_first_time - // pending_file is protecting all the files created after +} - ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr)); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DynamicMemtableOptions) { + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k5KB = 5 * 1024; + const int kNumPutsBeforeWaitForFlush = 64; + Options options; + options.env = env_; + options.create_if_missing = true; + options.compression = kNoCompression; + options.max_background_compactions = 1; + options.write_buffer_size = k64KB; + options.arena_block_size = 16 * 1024; + options.max_write_buffer_number = 2; + // Don't trigger compact/slowdown/stop + options.level0_file_num_compaction_trigger = 1024; + options.level0_slowdown_writes_trigger = 1024; + options.level0_stop_writes_trigger = 1024; + DestroyAndReopen(options); - ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(metadata.size(), 1U); - auto file_on_L2 = metadata[0].name; - listener->SetExpectedFileName(dbname_ + file_on_L2); + auto gen_l0_kb = [this, kNumPutsBeforeWaitForFlush](int size) { + Random 
rnd(301); + for (int i = 0; i < size; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, - true /* disallow trivial move */)); - ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); + // The following condition prevents a race condition between flush jobs + // acquiring work and this thread filling up multiple memtables. Without + // this, the flush might produce less files than expected because + // multiple memtables are flushed into a single L0 file. This race + // condition affects assertion (A). + if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) { + dbfull()->TEST_WaitForFlushMemTable(); + } + } + dbfull()->TEST_WaitForFlushMemTable(); + }; - // finish the flush! - blocking_thread.WakeUp(); - blocking_thread.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0)); + // Test write_buffer_size + gen_l0_kb(64); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_LT(SizeAtLevel(0), k64KB + k5KB); + ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2); - metadata.clear(); - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(metadata.size(), 2U); + // Clean up L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); - // This file should have been deleted during last compaction - ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2)); - listener->VerifyMatchedCount(1); -} -#endif // ROCKSDB_LITE + // Increase buffer size + ASSERT_OK(dbfull()->SetOptions({ + {"write_buffer_size", "131072"}, + })); -TEST_F(DBTest, CloseSpeedup) { - Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleLevel; - options.write_buffer_size = 110 << 10; // 110KB - options.arena_block_size = 4 << 10; - options.level0_file_num_compaction_trigger = 2; - options.num_levels = 4; - options.max_bytes_for_level_base = 400 * 1024; - options.max_write_buffer_number = 16; + // The 
existing memtable is still 64KB in size, after it becomes immutable, + // the next memtable will be 128KB in size. Write 256KB total, we should + // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data + gen_l0_kb(256); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A) + ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); + ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB); - // Block background threads + // Test max_write_buffer_number + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread env_->SetBackgroundThreads(1, Env::LOW); - env_->SetBackgroundThreads(1, Env::HIGH); test::SleepingBackgroundTask sleeping_task_low; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); - test::SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_high, Env::Priority::HIGH); - - std::vector filenames; - env_->GetChildren(dbname_, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(dbname_ + "/" + filenames[i]); - } - env_->DeleteDir(dbname_); + // Start from scratch and disable compaction/flush. Flush can only happen + // during compaction but trigger is pretty high + options.max_background_flushes = 0; + options.disable_auto_compactions = true; DestroyAndReopen(options); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - env_->SetBackgroundThreads(1, Env::LOW); - env_->SetBackgroundThreads(1, Env::HIGH); + // Put until writes are stopped, bounded by 256 puts. 
We should see stop at + // ~128KB + int count = 0; Random rnd(301); - int key_idx = 0; - // First three 110KB files are not going to level 2 - // After that, (100K, 200K) - for (int num = 0; num < 5; num++) { - GenerateNewFile(&rnd, &key_idx, true); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* arg) { sleeping_task_low.WakeUp(); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + while (!sleeping_task_low.WokenUp() && count < 256) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; } + ASSERT_GT(static_cast(count), 128 * 0.8); + ASSERT_LT(static_cast(count), 128 * 1.2); - ASSERT_EQ(0, GetSstFileCount(dbname_)); + sleeping_task_low.WaitUntilDone(); - Close(); - ASSERT_EQ(0, GetSstFileCount(dbname_)); + // Increase + ASSERT_OK(dbfull()->SetOptions({ + {"max_write_buffer_number", "8"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - // Unblock background threads - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); - sleeping_task_low.WakeUp(); + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + count = 0; + while (!sleeping_task_low.WokenUp() && count < 1024) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; + } +// Windows fails this test. 
Will tune in the future and figure out +// approp number +#ifndef OS_WIN + ASSERT_GT(static_cast(count), 512 * 0.8); + ASSERT_LT(static_cast(count), 512 * 1.2); +#endif sleeping_task_low.WaitUntilDone(); - Destroy(options); -} + // Decrease + ASSERT_OK(dbfull()->SetOptions({ + {"max_write_buffer_number", "4"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); -class DelayedMergeOperator : public MergeOperator { - private: - DBTest* db_test_; + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); - public: - explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} - virtual bool FullMerge(const Slice& key, const Slice* existing_value, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { - db_test_->env_->addon_time_.fetch_add(1000); - *new_value = ""; - return true; + count = 0; + while (!sleeping_task_low.WokenUp() && count < 1024) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; } +// Windows fails this test. 
Will tune in the future and figure out +// appropriate number +#ifndef OS_WIN + ASSERT_GT(static_cast(count), 256 * 0.8); + ASSERT_LT(static_cast(count), 266 * 1.2); +#endif + sleeping_task_low.WaitUntilDone(); - virtual const char* Name() const override { return "DelayedMergeOperator"; } -}; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // ROCKSDB_LITE -TEST_F(DBTest, MergeTestTime) { - std::string one, two, three; - PutFixed64(&one, 1); - PutFixed64(&two, 2); - PutFixed64(&three, 3); +#if ROCKSDB_USING_THREAD_STATUS +namespace { +void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type, + int expected_count) { + int op_count = 0; + std::vector thread_list; + ASSERT_OK(env->GetThreadList(&thread_list)); + for (auto thread : thread_list) { + if (thread.operation_type == op_type) { + op_count++; + } + } + ASSERT_EQ(op_count, expected_count); +} +} // namespace - // Enable time profiling - SetPerfLevel(kEnableTime); - this->env_->addon_time_.store(0); - this->env_->time_elapse_only_sleep_ = true; - this->env_->no_sleep_ = true; - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - options.merge_operator.reset(new DelayedMergeOperator(this)); - DestroyAndReopen(options); +TEST_F(DBTest, GetThreadStatus) { + Options options; + options.env = env_; + options.enable_thread_tracking = true; + TryReopen(options); - ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); - db_->Put(WriteOptions(), "foo", one); - ASSERT_OK(Flush()); - ASSERT_OK(db_->Merge(WriteOptions(), "foo", two)); - ASSERT_OK(Flush()); - ASSERT_OK(db_->Merge(WriteOptions(), "foo", three)); - ASSERT_OK(Flush()); + std::vector thread_list; + Status s = env_->GetThreadList(&thread_list); + + for (int i = 0; i < 2; ++i) { + // repeat the test with different number of high / low priority threads + const int kTestCount = 3; + const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5}; + const unsigned int 
kLowPriCounts[kTestCount] = {10, 15, 3}; + for (int test = 0; test < kTestCount; ++test) { + // Change the number of threads in high / low priority pool. + env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH); + env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); + // Wait to ensure that all threads have been registered + env_->SleepForMicroseconds(100000); + s = env_->GetThreadList(&thread_list); + ASSERT_OK(s); + unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES]; + memset(thread_type_counts, 0, sizeof(thread_type_counts)); + for (auto thread : thread_list) { + ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES); + thread_type_counts[thread.thread_type]++; + } + // Verify the total number of threads + ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY] + + thread_type_counts[ThreadStatus::LOW_PRIORITY], + kHighPriCounts[test] + kLowPriCounts[test]); + // Verify the number of high-priority threads + ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY], + kHighPriCounts[test]); + // Verify the number of low-priority threads + ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY], + kLowPriCounts[test]); + } + if (i == 0) { + // repeat the test with multiple column families + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + true); + } + } + db_->DropColumnFamily(handles_[2]); + delete handles_[2]; + handles_.erase(handles_.begin() + 2); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + true); + Close(); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + true); +} + +TEST_F(DBTest, DisableThreadStatus) { + Options options; + options.env = env_; + options.enable_thread_tracking = false; + TryReopen(options); + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + // Verify none of the column family info exists + 
env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + false); +} - ReadOptions opt; - opt.verify_checksums = true; - opt.snapshot = nullptr; - std::string result; - db_->Get(opt, "foo", &result); +TEST_F(DBTest, ThreadStatusFlush) { + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options.enable_thread_tracking = true; + options = CurrentOptions(options); - ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"}, + {"DBTest::ThreadStatusFlush:2", + "FlushJob::LogAndNotifyTableFileCreation()"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ReadOptions read_options; - std::unique_ptr iter(db_->NewIterator(read_options)); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - ++count; - } + CreateAndReopenWithCF({"pikachu"}, options); + VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); - ASSERT_EQ(1, count); - ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); -#if ROCKSDB_USING_THREAD_STATUS - ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0); -#endif // ROCKSDB_USING_THREAD_STATUS - this->env_->time_elapse_only_sleep_ = false; -} + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); -#ifndef ROCKSDB_LITE -TEST_P(DBTestWithParam, MergeCompactionTimeTest) { - SetPerfLevel(kEnableTime); - Options options = CurrentOptions(); - options.compaction_filter_factory = std::make_shared(); - options.statistics = rocksdb::CreateDBStatistics(); - options.merge_operator.reset(new DelayedMergeOperator(this)); - options.compaction_style = kCompactionStyleUniversal; - options.max_subcompactions = max_subcompactions_; - DestroyAndReopen(options); + uint64_t num_running_flushes = 0; + 
db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_EQ(num_running_flushes, 0); - for (int i = 0; i < 1000; i++) { - ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST")); - ASSERT_OK(Flush()); - } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger flush - ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); + // The first sync point is to make sure there's one flush job + // running when we perform VerifyOperationCount(). + TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1"); + VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1); + db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_EQ(num_running_flushes, 1); + // This second sync point is to ensure the flush job will not + // be completed until we already perform VerifyOperationCount(). + TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2"); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_P(DBTestWithParam, FilterCompactionTimeTest) { - Options options = CurrentOptions(); - options.compaction_filter_factory = - std::make_shared(this); - options.disable_auto_compactions = true; +TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + 
options.enable_thread_tracking = true; + const int kNumL0Files = 4; + options.level0_file_num_compaction_trigger = kNumL0Files; options.max_subcompactions = max_subcompactions_; - DestroyAndReopen(options); - // put some data - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"}, + {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"}, + {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"}, + }); + for (int tests = 0; tests < 2; ++tests) { + DestroyAndReopen(options); + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + // The Put Phase. + for (int file = 0; file < kNumL0Files; ++file) { + for (int key = 0; key < kEntriesPerBuffer; ++key) { + ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer), + RandomString(&rnd, kTestValueSize))); + } + Flush(); } - Flush(); - } + // This makes sure a compaction won't be scheduled until + // we have done with the above Put Phase. + uint64_t num_running_compactions = 0; + db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions); + ASSERT_EQ(num_running_compactions, 0); + TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0"); + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); - CompactRangeOptions cro; - cro.exclusive_manual_compaction = exclusive_manual_compaction_; - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - ASSERT_EQ(0U, CountLiveFiles()); + // This makes sure at least one compaction is running. 
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1"); - Reopen(options); + if (options.enable_thread_tracking) { + // expecting one single L0 to L1 compaction + VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1); + } else { + // If thread tracking is not enabled, compaction count should be 0. + VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0); + } + db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions); + ASSERT_EQ(num_running_compactions, 1); + // TODO(yhchiang): adding assert to verify each compaction stage. + TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2"); - Iterator* itr = db_->NewIterator(ReadOptions()); - itr->SeekToFirst(); - ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); - delete itr; + // repeat the test with disabling thread tracking. + options.enable_thread_tracking = false; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } } -#endif // ROCKSDB_LITE -TEST_F(DBTest, TestLogCleanup) { +TEST_P(DBTestWithParam, PreShutdownManualCompaction) { Options options = CurrentOptions(); - options.write_buffer_size = 64 * 1024; // very small - // only two memtables allowed ==> only two log files - options.max_write_buffer_number = 2; - Reopen(options); + options.max_background_flushes = 0; + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); - for (int i = 0; i < 100000; ++i) { - Put(Key(i), "val"); - // only 2 memtables will be alive, so logs_to_free needs to always be below - // 2 - ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); - } -} + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); -#ifndef ROCKSDB_LITE -TEST_F(DBTest, EmptyCompactedDB) { - Options options = CurrentOptions(); - options.max_open_files = -1; - Close(); - ASSERT_OK(ReadOnlyReopen(options)); - Status s = Put("new", 
"value"); - ASSERT_TRUE(s.IsNotSupported()); - Close(); -} -#endif // ROCKSDB_LITE + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); -#ifndef ROCKSDB_LITE -TEST_F(DBTest, SuggestCompactRangeTest) { - class CompactionFilterFactoryGetContext : public CompactionFilterFactory { - public: - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { - saved_context = context; - std::unique_ptr empty_filter; - return empty_filter; - } - const char* Name() const override { - return "CompactionFilterFactoryGetContext"; - } - static bool IsManual(CompactionFilterFactory* compaction_filter_factory) { - return reinterpret_cast( - compaction_filter_factory)->saved_context.is_manual_compaction; - } - CompactionFilter::Context saved_context; - }; + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); - Options options = CurrentOptions(); - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); - options.compaction_style = kCompactionStyleLevel; - options.compaction_filter_factory.reset( - new CompactionFilterFactoryGetContext()); - options.write_buffer_size = 200 << 10; - options.arena_block_size = 4 << 10; - options.level0_file_num_compaction_trigger = 4; - options.num_levels = 4; - options.compression = kNoCompression; - options.max_bytes_for_level_base = 450 << 10; - options.target_file_size_base = 98 << 10; - options.max_grandparent_overlap_factor = 1 << 20; // inf + // Compaction range overlaps files + Compact(1, "p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); - Reopen(options); + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("1,1,2", FilesPerLevel(1)); - Random rnd(301); + // Compact just the new range + Compact(1, "b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel(1)); - for (int num = 0; num < 3; num++) { - GenerateNewRandomFile(&rnd); + // Compact 
all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + CancelAllBackgroundWork(db_); + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + + if (iter == 0) { + options = CurrentOptions(); + options.max_background_flushes = 0; + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + } } +} - GenerateNewRandomFile(&rnd); - ASSERT_EQ("0,4", FilesPerLevel(0)); - ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual( - options.compaction_filter_factory.get())); - - GenerateNewRandomFile(&rnd); - ASSERT_EQ("1,4", FilesPerLevel(0)); +TEST_F(DBTest, PreShutdownFlush) { + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "key", "value")); + CancelAllBackgroundWork(db_); + Status s = + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_TRUE(s.IsShutdownInProgress()); +} - GenerateNewRandomFile(&rnd); - ASSERT_EQ("2,4", FilesPerLevel(0)); +TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 40; + const int kNumL0Files = 4; - GenerateNewRandomFile(&rnd); - ASSERT_EQ("3,4", FilesPerLevel(0)); + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("0,4,4", FilesPerLevel(0)); + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; 
+ options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.max_subcompactions = max_subcompactions_; - GenerateNewRandomFile(&rnd); - ASSERT_EQ("1,4,4", FilesPerLevel(0)); + TryReopen(options); + Random rnd(301); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("2,4,4", FilesPerLevel(0)); + std::vector thread_list; + // Delay both flush and compaction + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownMultipleCompaction:Preshutdown"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"}, + {"DBTest::PreShutdownMultipleCompaction:Preshutdown", + "CompactionJob::Run():End"}, + {"CompactionJob::Run():End", + "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}}); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("3,4,4", FilesPerLevel(0)); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("0,4,8", FilesPerLevel(0)); + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + } - GenerateNewRandomFile(&rnd); - ASSERT_EQ("1,4,8", FilesPerLevel(0)); + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } - // compact it three times - for (int i = 0; i < 3; ++i) { - 
ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + // Speed up the test + if (operation_count[ThreadStatus::OP_FLUSH] > 1 && + operation_count[ThreadStatus::OP_COMPACTION] > + 0.6 * options.max_background_compactions) { + break; + } + if (file == 15 * kNumL0Files) { + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + } } - // All files are compacted - ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_EQ(0, NumTableFilesAtLevel(1)); - - GenerateNewRandomFile(&rnd); - ASSERT_EQ(1, NumTableFilesAtLevel(0)); - - // nonoverlapping with the file on level 0 - Slice start("a"), end("b"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); - dbfull()->TEST_WaitForCompact(); - - // should not compact the level 0 file - ASSERT_EQ(1, NumTableFilesAtLevel(0)); - - start = Slice("j"); - end = Slice("m"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); + CancelAllBackgroundWork(db_); + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual( - options.compaction_filter_factory.get())); - - // now it should compact the level 0 file - ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_EQ(1, NumTableFilesAtLevel(1)); + // Record the number of compactions at a time. 
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { + operation_count[i] = 0; + } + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); } -TEST_F(DBTest, PromoteL0) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.write_buffer_size = 10 * 1024 * 1024; - DestroyAndReopen(options); +TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 40; + const int kNumL0Files = 4; - // non overlapping ranges - std::vector> ranges = { - {81, 160}, {0, 80}, {161, 240}, {241, 320}}; + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); - int32_t value_size = 10 * 1024; // 10 KB + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.max_subcompactions = max_subcompactions_; + TryReopen(options); Random rnd(301); - std::map values; - for (const auto& range : ranges) { - for (int32_t j = range.first; j < range.second; j++) { - values[j] = RandomString(&rnd, value_size); - 
ASSERT_OK(Put(Key(j), values[j])); - } - ASSERT_OK(Flush()); - } - int32_t level0_files = NumTableFilesAtLevel(0, 0); - ASSERT_EQ(level0_files, ranges.size()); - ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 + std::vector thread_list; + // Delay both flush and compaction + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBTest::PreShutdownCompactionMiddle:Preshutdown", + "CompactionJob::Run():Inprogress"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"}, + {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"}, + {"CompactionJob::Run():End", + "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}}); - // Promote L0 level to L2. - ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2)); - // We expect that all the files were trivially moved from L0 to L2 - ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); - ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - for (const auto& kv : values) { - ASSERT_EQ(Get(Key(kv.first)), kv.second); - } -} + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + } -TEST_F(DBTest, PromoteL0Failure) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.write_buffer_size = 10 * 1024 * 1024; - DestroyAndReopen(options); + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } - // Produce two L0 files with overlapping ranges. 
- ASSERT_OK(Put(Key(0), "")); - ASSERT_OK(Put(Key(3), "")); - ASSERT_OK(Flush()); - ASSERT_OK(Put(Key(1), "")); - ASSERT_OK(Flush()); + // Speed up the test + if (operation_count[ThreadStatus::OP_FLUSH] > 1 && + operation_count[ThreadStatus::OP_COMPACTION] > + 0.6 * options.max_background_compactions) { + break; + } + if (file == 15 * kNumL0Files) { + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction"); + } + } - Status status; - // Fails because L0 has overlapping files. - status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); - ASSERT_TRUE(status.IsInvalidArgument()); + ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); + CancelAllBackgroundWork(db_); + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); + dbfull()->TEST_WaitForCompact(); + // Record the number of compactions at a time. + for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { + operation_count[i] = 0; + } + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); +} - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - // Now there is a file in L1. - ASSERT_GE(NumTableFilesAtLevel(1, 0), 1); +#endif // ROCKSDB_USING_THREAD_STATUS - ASSERT_OK(Put(Key(5), "")); - ASSERT_OK(Flush()); - // Fails because L1 is non-empty. 
- status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); - ASSERT_TRUE(status.IsInvalidArgument()); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, FlushOnDestroy) { + WriteOptions wo; + wo.disableWAL = true; + ASSERT_OK(Put("foo", "v1", wo)); + CancelAllBackgroundWork(db_); } -#endif // ROCKSDB_LITE -// Github issue #596 -TEST_F(DBTest, HugeNumberOfLevels) { - Options options = CurrentOptions(); - options.write_buffer_size = 2 * 1024 * 1024; // 2MB - options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB - options.num_levels = 12; - options.max_background_compactions = 10; - options.max_bytes_for_level_multiplier = 2; - options.level_compaction_dynamic_level_bytes = true; - DestroyAndReopen(options); +TEST_F(DBTest, DynamicLevelCompressionPerLevel) { + if (!Snappy_Supported()) { + return; + } + const int kNKeys = 120; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + std::random_shuffle(std::begin(keys), std::end(keys)); Random rnd(301); - for (int i = 0; i < 300000; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - } + Options options; + options.create_if_missing = true; + options.db_write_buffer_size = 20480; + options.write_buffer_size = 20480; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.target_file_size_base = 2048; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 102400; + options.max_bytes_for_level_multiplier = 4; + options.max_background_compactions = 1; + options.num_levels = 5; - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); -} + options.compression_per_level.resize(3); + options.compression_per_level[0] = kNoCompression; + options.compression_per_level[1] = kNoCompression; + options.compression_per_level[2] = kSnappyCompression; + + OnFileDeletionListener* listener = new OnFileDeletionListener(); + 
options.listeners.emplace_back(listener); -TEST_F(DBTest, AutomaticConflictsWithManualCompaction) { - Options options = CurrentOptions(); - options.write_buffer_size = 2 * 1024 * 1024; // 2MB - options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB - options.num_levels = 12; - options.max_background_compactions = 10; - options.max_bytes_for_level_multiplier = 2; - options.level_compaction_dynamic_level_bytes = true; DestroyAndReopen(options); - Random rnd(301); - for (int i = 0; i < 300000; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should + // be compressed, so total data size should be more than 80K. + for (int i = 0; i < 20; i++) { + ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } + Flush(); + dbfull()->TEST_WaitForCompact(); - std::atomic callback_count(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction()::Conflict", - [&](void* arg) { callback_count.fetch_add(1); }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - CompactRangeOptions croptions; - croptions.exclusive_manual_compaction = false; - ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); - ASSERT_GE(callback_count.load(), 1); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - for (int i = 0; i < 300000; ++i) { - ASSERT_NE("NOT_FOUND", Get(Key(i))); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(3), 0); + ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U); + + // Insert 400KB. 
Some data will be compressed + for (int i = 21; i < 120; i++) { + ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } -} + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U); + // Make sure data in files in L3 is not compacted by removing all files + // in L4 and calculate number of rows + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + })); + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + for (auto file : cf_meta.levels[4].files) { + listener->SetExpectedFileName(dbname_ + file.name); + ASSERT_OK(dbfull()->DeleteFile(file.name)); + } + listener->VerifyMatchedCount(cf_meta.levels[4].files.size()); -// Github issue #595 -// Large write batch with column families -TEST_F(DBTest, LargeBatchWithColumnFamilies) { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - CreateAndReopenWithCF({"pikachu"}, options); - int64_t j = 0; - for (int i = 0; i < 5; i++) { - for (int pass = 1; pass <= 3; pass++) { - WriteBatch batch; - size_t write_size = 1024 * 1024 * (5 + i); - fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n", (write_size / 1024 / 1024), - pass); - for (;;) { - std::string data(3000, j++ % 127 + 20); - data += ToString(j); - batch.Put(handles_[0], Slice(data), Slice(data)); - if (batch.GetDataSize() > write_size) { - break; - } - } - fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n", (batch.GetDataSize() / 1024 / 1024)); - ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); - fprintf(stderr, "done\n"); - } + int num_keys = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; } - // make sure we can re-open it. 
- ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + ASSERT_OK(iter->status()); + ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U); } -// Make sure that Flushes can proceed in parallel with CompactRange() -TEST_F(DBTest, FlushesInParallelWithCompactRange) { - // iter == 0 -- leveled - // iter == 1 -- leveled, but throw in a flush between two levels compacting - // iter == 2 -- universal - for (int iter = 0; iter < 3; ++iter) { - Options options = CurrentOptions(); - if (iter < 2) { - options.compaction_style = kCompactionStyleLevel; - } else { - options.compaction_style = kCompactionStyleUniversal; - } - options.write_buffer_size = 110 << 10; - options.level0_file_num_compaction_trigger = 4; - options.num_levels = 4; - options.compression = kNoCompression; - options.max_bytes_for_level_base = 450 << 10; - options.target_file_size_base = 98 << 10; - options.max_write_buffer_number = 2; - - DestroyAndReopen(options); +TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { + if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) { + return; + } + const int kNKeys = 500; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + std::random_shuffle(std::begin(keys), std::end(keys)); - Random rnd(301); - for (int num = 0; num < 14; num++) { - GenerateNewRandomFile(&rnd); - } + Random rnd(301); + Options options; + options.create_if_missing = true; + options.db_write_buffer_size = 6000; + options.write_buffer_size = 6000; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.soft_pending_compaction_bytes_limit = 1024 * 1024; - if (iter == 1) { - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::RunManualCompaction()::1", - "DBTest::FlushesInParallelWithCompactRange:1"}, - {"DBTest::FlushesInParallelWithCompactRange:2", - "DBImpl::RunManualCompaction()::2"}}); - } 
else { - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"CompactionJob::Run():Start", - "DBTest::FlushesInParallelWithCompactRange:1"}, - {"DBTest::FlushesInParallelWithCompactRange:2", - "CompactionJob::Run():End"}}); - } - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Use file size to distinguish levels + // L1: 10, L2: 20, L3 40, L4 80 + // L0 is less than 30 + options.target_file_size_base = 10; + options.target_file_size_multiplier = 2; - std::vector threads; - threads.emplace_back([&]() { Compact("a", "z"); }); + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 200; + options.max_bytes_for_level_multiplier = 8; + options.max_background_compactions = 1; + options.num_levels = 5; + std::shared_ptr mtf(new mock::MockTableFactory); + options.table_factory = mtf; - TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1"); + options.compression_per_level.resize(3); + options.compression_per_level[0] = kNoCompression; + options.compression_per_level[1] = kLZ4Compression; + options.compression_per_level[2] = kZlibCompression; - // this has to start a flush. if flushes are blocked, this will try to - // create - // 3 memtables, and that will fail because max_write_buffer_number is 2 - for (int num = 0; num < 3; num++) { - GenerateNewRandomFile(&rnd, /* nowait */ true); - } + DestroyAndReopen(options); + // When base level is L4, L4 is LZ4. 
+ std::atomic num_zlib(0); + std::atomic num_lz4(0); + std::atomic num_no(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + if (compaction->output_level() == 4) { + ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); + num_lz4.fetch_add(1); + } + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { + auto* compression = reinterpret_cast(arg); + ASSERT_TRUE(*compression == kNoCompression); + num_no.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2"); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); - for (auto& t : threads) { - t.join(); + if (i % 25 == 0) { + dbfull()->TEST_WaitForFlushMemTable(); } - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -} - -TEST_F(DBTest, DelayedWriteRate) { - const int kEntriesPerMemTable = 100; - const int kTotalFlushes = 20; - - Options options = CurrentOptions(); - env_->SetBackgroundThreads(1, Env::LOW); - options.env = env_; - env_->no_sleep_ = true; - options.write_buffer_size = 100000000; - options.max_write_buffer_number = 256; - options.max_background_compactions = 1; - options.level0_file_num_compaction_trigger = 3; - options.level0_slowdown_writes_trigger = 3; - options.level0_stop_writes_trigger = 999999; - options.delayed_write_rate = 20000000; // Start with 200MB/s - options.memtable_factory.reset( - new SpecialSkipListFactory(kEntriesPerMemTable)); - CreateAndReopenWithCF({"pikachu"}, options); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - // Block compactions - test::SleepingBackgroundTask sleeping_task_low; - 
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(3), 0); + ASSERT_GT(NumTableFilesAtLevel(4), 0); + ASSERT_GT(num_no.load(), 2); + ASSERT_GT(num_lz4.load(), 0); + int prev_num_files_l4 = NumTableFilesAtLevel(4); - for (int i = 0; i < 3; i++) { - Put(Key(i), std::string(10000, 'x')); - Flush(); - } + // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib + num_lz4.store(0); + num_no.store(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + if (compaction->output_level() == 4 && compaction->start_level() == 3) { + ASSERT_TRUE(compaction->output_compression() == kZlibCompression); + num_zlib.fetch_add(1); + } else { + ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); + num_lz4.fetch_add(1); + } + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { + auto* compression = reinterpret_cast(arg); + ASSERT_TRUE(*compression == kNoCompression); + num_no.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - // These writes will be slowed down to 1KB/s - uint64_t estimated_sleep_time = 0; - Random rnd(301); - Put("", ""); - uint64_t cur_rate = options.delayed_write_rate; - for (int i = 0; i < kTotalFlushes; i++) { - uint64_t size_memtable = 0; - for (int j = 0; j < kEntriesPerMemTable; j++) { - auto rand_num = rnd.Uniform(20); - // Spread the size range to more. 
- size_t entry_size = rand_num * rand_num * rand_num; - WriteOptions wo; - Put(Key(i), std::string(entry_size, 'x'), wo); - size_memtable += entry_size + 18; - // Occasionally sleep a while - if (rnd.Uniform(20) == 6) { - env_->SleepForMicroseconds(2666); - } + for (int i = 101; i < 500; i++) { + ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); + if (i % 100 == 99) { + Flush(); + dbfull()->TEST_WaitForCompact(); } - dbfull()->TEST_WaitForFlushMemTable(); - estimated_sleep_time += size_memtable * 1000000u / cur_rate; - // Slow down twice. One for memtable switch and one for flush finishes. - cur_rate = static_cast(static_cast(cur_rate) / - kSlowdownRatio / kSlowdownRatio); } - // Estimate the total sleep time fall into the rough range. - ASSERT_GT(env_->addon_time_.load(), - static_cast(estimated_sleep_time / 2)); - ASSERT_LT(env_->addon_time_.load(), - static_cast(estimated_sleep_time * 2)); - env_->no_sleep_ = false; + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_GT(NumTableFilesAtLevel(3), 0); + ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4); + ASSERT_GT(num_no.load(), 2); + ASSERT_GT(num_lz4.load(), 0); + ASSERT_GT(num_zlib.load(), 0); } -TEST_F(DBTest, HardLimit) { - Options options = CurrentOptions(); +TEST_F(DBTest, DynamicCompactionOptions) { + // minimum write buffer size is enforced at 64KB + const uint64_t k32KB = 1 << 15; + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k1MB = 1 << 20; + const uint64_t k4KB = 1 << 12; + Options options; options.env = env_; - env_->SetBackgroundThreads(1, Env::LOW); - options.max_write_buffer_number = 256; - options.write_buffer_size = 110 << 10; // 110KB - options.arena_block_size = 4 * 1024; - options.level0_file_num_compaction_trigger = 4; - 
options.level0_slowdown_writes_trigger = 999999; - options.level0_stop_writes_trigger = 999999; - options.hard_pending_compaction_bytes_limit = 800 << 10; - options.max_bytes_for_level_base = 10000000000u; - options.max_background_compactions = 1; - options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.create_if_missing = true; + options.compression = kNoCompression; + options.soft_pending_compaction_bytes_limit = 1024 * 1024; + options.write_buffer_size = k64KB; + options.arena_block_size = 4 * k4KB; + options.max_write_buffer_number = 2; + // Compaction related options + options.level0_file_num_compaction_trigger = 3; + options.level0_slowdown_writes_trigger = 4; + options.level0_stop_writes_trigger = 8; + options.max_grandparent_overlap_factor = 10; + options.expanded_compaction_factor = 25; + options.source_compaction_factor = 1; + options.target_file_size_base = k64KB; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = k128KB; + options.max_bytes_for_level_multiplier = 4; + // Block flush thread and disable compaction thread env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); + auto gen_l0_kb = [this](int start, int size, int stride) { + Random rnd(301); + for (int i = 0; i < size; i++) { + ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); + } + dbfull()->TEST_WaitForFlushMemTable(); + }; - std::atomic callback_count(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack("DBImpl::DelayWrite:Wait", - [&](void* arg) { - callback_count.fetch_add(1); - sleeping_task_low.WakeUp(); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Write 3 files that have the same key range. 
+ // Since level0_file_num_compaction_trigger is 3, compaction should be + // triggered. The compaction should result in one L1 file + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel()); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1U, metadata.size()); + ASSERT_LE(metadata[0].size, k64KB + k4KB); + ASSERT_GE(metadata[0].size, k64KB - k4KB); - Random rnd(301); - int key_idx = 0; - for (int num = 0; num < 5; num++) { - GenerateNewFile(&rnd, &key_idx, true); - dbfull()->TEST_WaitForFlushMemTable(); + // Test compaction trigger and target_file_size_base + // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. + // Writing to 64KB L0 files should trigger a compaction. Since these + // 2 L0 files have the same key range, compaction merge them and should + // result in 2 32KB L1 files. + ASSERT_OK(dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}, + {"target_file_size_base", ToString(k32KB)}})); + + gen_l0_kb(0, 64, 1); + ASSERT_EQ("1,1", FilesPerLevel()); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,2", FilesPerLevel()); + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2U, metadata.size()); + ASSERT_LE(metadata[0].size, k32KB + k4KB); + ASSERT_GE(metadata[0].size, k32KB - k4KB); + ASSERT_LE(metadata[1].size, k32KB + k4KB); + ASSERT_GE(metadata[1].size, k32KB - k4KB); + + // Test max_bytes_for_level_base + // Increase level base size to 256KB and write enough data that will + // fill L1 and L2. L1 size should be around 256KB while L2 size should be + // around 256KB x 4. 
+ ASSERT_OK( + dbfull()->SetOptions({{"max_bytes_for_level_base", ToString(k1MB)}})); + + // writing 96 x 64KB => 6 * 1024KB + // (L1 + L2) = (1 + 4) * 1024KB + for (int i = 0; i < 96; ++i) { + gen_l0_kb(i, 64, 96); } + dbfull()->TEST_WaitForCompact(); + ASSERT_GT(SizeAtLevel(1), k1MB / 2); + ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); - ASSERT_EQ(0, callback_count.load()); + // Within (0.5, 1.5) of 4MB. + ASSERT_GT(SizeAtLevel(2), 2 * k1MB); + ASSERT_LT(SizeAtLevel(2), 6 * k1MB); - for (int num = 0; num < 5; num++) { - GenerateNewFile(&rnd, &key_idx, true); - dbfull()->TEST_WaitForFlushMemTable(); + // Test max_bytes_for_level_multiplier and + // max_bytes_for_level_base. Now, reduce both mulitplier and level base, + // After filling enough data that can fit in L1 - L3, we should see L1 size + // reduces to 128KB from 256KB which was asserted previously. Same for L2. + ASSERT_OK( + dbfull()->SetOptions({{"max_bytes_for_level_multiplier", "2"}, + {"max_bytes_for_level_base", ToString(k128KB)}})); + + // writing 20 x 64KB = 10 x 128KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB + for (int i = 0; i < 20; ++i) { + gen_l0_kb(i, 64, 32); + } + dbfull()->TEST_WaitForCompact(); + uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); + ASSERT_TRUE(total_size < k128KB * 7 * 1.5); + + // Test level0_stop_writes_trigger. + // Clean up memtable and L0. Block compaction threads. If continue to write + // and flush memtables. 
We should see put stop after 8 memtable flushes + // since level0_stop_writes_trigger = 8 + dbfull()->TEST_FlushMemTable(true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // Block compaction + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + int count = 0; + Random rnd(301); + WriteOptions wo; + while (count < 64) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); + dbfull()->TEST_FlushMemTable(true); + count++; + if (dbfull()->TEST_write_controler().IsStopped()) { + sleeping_task_low.WakeUp(); + break; + } } - ASSERT_GE(callback_count.load(), 1); - - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + // Stop trigger = 8 + ASSERT_EQ(count, 8); + // Unblock sleeping_task_low.WaitUntilDone(); -} - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, SoftLimit) { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - options.max_write_buffer_number = 256; - options.level0_file_num_compaction_trigger = 1; - options.level0_slowdown_writes_trigger = 3; - options.level0_stop_writes_trigger = 999999; - options.delayed_write_rate = 20000; // About 200KB/s limited rate - options.soft_pending_compaction_bytes_limit = 200000; - options.target_file_size_base = 99999999; // All into one file - options.max_bytes_for_level_base = 50000; - options.max_bytes_for_level_multiplier = 10; - options.max_background_compactions = 1; - options.compression = kNoCompression; - Reopen(options); - Put(Key(0), ""); + // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. + // Block compaction thread again. Perform the put and memtable flushes + // until we see the stop after 6 memtable flushes. 
+ ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}})); + dbfull()->TEST_FlushMemTable(true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); - test::SleepingBackgroundTask sleeping_task_low; - // Block compactions + // Block compaction again + sleeping_task_low.Reset(); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); sleeping_task_low.WaitUntilSleeping(); - - // Create 3 L0 files, making score of L0 to be 3. - for (int i = 0; i < 3; i++) { - Put(Key(i), std::string(5000, 'x')); - Put(Key(100 - i), std::string(5000, 'x')); - // Flush the file. File size is around 30KB. - Flush(); + count = 0; + while (count < 64) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); + dbfull()->TEST_FlushMemTable(true); + count++; + if (dbfull()->TEST_write_controler().IsStopped()) { + sleeping_task_low.WakeUp(); + break; + } } - ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - - sleeping_task_low.WakeUp(); + ASSERT_EQ(count, 6); + // Unblock sleeping_task_low.WaitUntilDone(); - sleeping_task_low.Reset(); - dbfull()->TEST_WaitForCompact(); - // Now there is one L1 file but doesn't trigger soft_rate_limit - // The L1 file size is around 30KB. - ASSERT_EQ(NumTableFilesAtLevel(1), 1); - ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + // Test disable_auto_compactions + // Compaction thread is unblocked but auto compaction is disabled. Write + // 4 L0 files and compaction should be triggered. If auto compaction is + // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of + // L0 files do not change after the call. + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}})); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); - // Only allow one compactin going through. 
- rocksdb::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void* arg) { - // Schedule a sleeping task. - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_low, Env::Priority::LOW); - }); + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't stop + dbfull()->TEST_FlushMemTable(true); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 4); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Enable auto compaction and perform the same test, # of L0 files should be + // reduced after compaction. + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); - // Create 3 L0 files, making score of L0 to be 3 - for (int i = 0; i < 3; i++) { - Put(Key(10 + i), std::string(5000, 'x')); - Put(Key(90 - i), std::string(5000, 'x')); - // Flush the file. File size is around 30KB. - Flush(); + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't stop + dbfull()->TEST_FlushMemTable(true); } + dbfull()->TEST_WaitForCompact(); + ASSERT_LT(NumTableFilesAtLevel(0), 4); +} +#endif // ROCKSDB_LITE - // Wake up sleep task to enable compaction to run and waits - // for it to go to sleep state again to make sure one compaction - // goes through. 
- sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilSleeping(); +TEST_F(DBTest, FileCreationRandomFailure) { + Options options; + options.env = env_; + options.create_if_missing = true; + options.write_buffer_size = 100000; // Small write buffer + options.target_file_size_base = 200000; + options.max_bytes_for_level_base = 1000000; + options.max_bytes_for_level_multiplier = 2; - // Now there is one L1 file (around 60KB) which exceeds 50KB base by 10KB - // Given level multiplier 10, estimated pending compaction is around 100KB - // doesn't trigger soft_pending_compaction_bytes_limit - ASSERT_EQ(NumTableFilesAtLevel(1), 1); - ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + DestroyAndReopen(options); + Random rnd(301); - // Create 3 L0 files, making score of L0 to be 3, higher than L0. - for (int i = 0; i < 3; i++) { - Put(Key(20 + i), std::string(5000, 'x')); - Put(Key(80 - i), std::string(5000, 'x')); - // Flush the file. File size is around 30KB. - Flush(); + const int kCDTKeysPerBuffer = 4; + const int kTestSize = kCDTKeysPerBuffer * 4096; + const int kTotalIteration = 100; + // the second half of the test involves in random failure + // of file creation. + const int kRandomFailureTest = kTotalIteration / 2; + std::vector values; + for (int i = 0; i < kTestSize; ++i) { + values.push_back("NOT_FOUND"); + } + for (int j = 0; j < kTotalIteration; ++j) { + if (j == kRandomFailureTest) { + env_->non_writeable_rate_.store(90); + } + for (int k = 0; k < kTestSize; ++k) { + // here we expect some of the Put fails. + std::string value = RandomString(&rnd, 100); + Status s = Put(Key(k), Slice(value)); + if (s.ok()) { + // update the latest successful put + values[k] = value; + } + // But everything before we simulate the failure-test should succeed. + if (j < kRandomFailureTest) { + ASSERT_OK(s); + } + } } - // Wake up sleep task to enable compaction to run and waits - // for it to go to sleep state again to make sure one compaction - // goes through. 
- sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilSleeping(); - // Now there is one L1 file (around 90KB) which exceeds 50KB base by 40KB - // Given level multiplier 10, estimated pending compaction is around 400KB - // triggerring soft_pending_compaction_bytes_limit - ASSERT_EQ(NumTableFilesAtLevel(1), 1); - ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + // If rocksdb does not do the correct job, internal assert will fail here. + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilSleeping(); + // verify we have the latest successful update + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); + } - ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + // reopen and reverify we have the latest successful update + env_->non_writeable_rate_.store(0); + Reopen(options); + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); + } +} - // shrink level base so L2 will hit soft limit easier. 
- ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_base", "5000"}, - })); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DynamicMiscOptions) { + // Test max_sequential_skip_in_iterations + Options options; + options.env = env_; + options.create_if_missing = true; + options.max_sequential_skip_in_iterations = 16; + options.compression = kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); - Put("", ""); - Flush(); - ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { + int key0 = key_start; + int key1 = key_start + 1; + int key2 = key_start + 2; + Random rnd(301); + ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + } + ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(key1)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key1)), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key2)), 0); + ASSERT_EQ(num_reseek, + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + }; + // No reseek + assert_reseek_count(100, 0); - sleeping_task_low.WaitUntilSleeping(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}})); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // Trigger reseek + assert_reseek_count(200, 1); + + ASSERT_OK( + dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}})); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // No reseek + assert_reseek_count(300, 1); } +#endif // ROCKSDB_LITE + +TEST_F(DBTest, L0L1L2AndUpHitCounter) { + Options options = 
CurrentOptions(); + options.write_buffer_size = 32 * 1024; + options.target_file_size_base = 32 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 64 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"mypikachu"}, options); -TEST_F(DBTest, LastWriteBufferDelay) { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 100000; - options.max_write_buffer_number = 4; - options.delayed_write_rate = 20000; - options.compression = kNoCompression; - options.disable_auto_compactions = true; - int kNumKeysPerMemtable = 3; - options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); + int numkeys = 20000; + for (int i = 0; i < numkeys; i++) { + ASSERT_OK(Put(1, Key(i), "val")); + } + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - Reopen(options); - test::SleepingBackgroundTask sleeping_task; - // Block flushes - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, - Env::Priority::HIGH); - sleeping_task.WaitUntilSleeping(); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); - // Create 3 L0 files, making score of L0 to be 3. - for (int i = 0; i < 3; i++) { - // Fill one mem table - for (int j = 0; j < kNumKeysPerMemtable; j++) { - Put(Key(j), ""); - } - ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + for (int i = 0; i < numkeys; i++) { + ASSERT_EQ(Get(1, Key(i)), "val"); } - // Inserting a new entry would create a new mem table, triggering slow down. 
- Put(Key(0), ""); - ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - sleeping_task.WakeUp(); - sleeping_task.WaitUntilDone(); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + + ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); } -#endif // ROCKSDB_LITE -TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { +TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { + // iter 0 -- zlib + // iter 1 -- bzip2 + // iter 2 -- lz4 + // iter 3 -- lz4HC CompressionType compressions[] = {kZlibCompression, kBZip2Compression, kLZ4Compression, kLZ4HCCompression}; for (int iter = 0; iter < 4; ++iter) { if (!CompressionTypeSupported(compressions[iter])) { - // not supported, we should fail the Open() + continue; + } + // first_table_version 1 -- generate with table_version == 1, read with + // table_version == 2 + // first_table_version 2 -- generate with table_version == 2, read with + // table_version == 1 + for (int first_table_version = 1; first_table_version <= 2; + ++first_table_version) { + BlockBasedTableOptions table_options; + table_options.format_version = first_table_version; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; options.compression = compressions[iter]; - ASSERT_TRUE(!TryReopen(options).ok()); - // Try if CreateColumnFamily also fails - options.compression = kNoCompression; - ASSERT_OK(TryReopen(options)); - ColumnFamilyOptions cf_options(options); - cf_options.compression = compressions[iter]; - ColumnFamilyHandle* handle; - ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok()); + DestroyAndReopen(options); + + int kNumKeysWritten = 
100000; + + Random rnd(301); + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + } + + table_options.format_version = first_table_version == 1 ? 2 : 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + ASSERT_EQ(r.substr(128), std::string(128, 'a')); + } } } } -#ifndef ROCKSDB_LITE -TEST_F(DBTest, RowCache) { +TEST_F(DBTest, MutexWaitStatsDisabledByDefault) { Options options = CurrentOptions(); + options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); - options.row_cache = NewLRUCache(8192); - DestroyAndReopen(options); - - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Flush()); - - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0); - ASSERT_EQ(Get("foo"), "bar"); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); - ASSERT_EQ(Get("foo"), "bar"); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); + CreateAndReopenWithCF({"pikachu"}, options); + const uint64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, + kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0); + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); } -#endif // ROCKSDB_LITE -// TODO(3.13): fix the issue of Seek() + Prev() which might not necessary -// return the biggest key which is smaller than the seek key. 
-TEST_F(DBTest, PrevAfterMerge) { - Options options; +TEST_F(DBTest, MutexWaitStats) { + Options options = CurrentOptions(); options.create_if_missing = true; - options.merge_operator = MergeOperators::CreatePutOperator(); - DestroyAndReopen(options); - - // write three entries with different keys using Merge() - WriteOptions wopts; - db_->Merge(wopts, "1", "data1"); - db_->Merge(wopts, "2", "data2"); - db_->Merge(wopts, "3", "data3"); - - std::unique_ptr it(db_->NewIterator(ReadOptions())); - - it->Seek("2"); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("2", it->key().ToString()); - - it->Prev(); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("1", it->key().ToString()); + options.statistics = rocksdb::CreateDBStatistics(); + options.statistics->stats_level_ = StatsLevel::kAll; + CreateAndReopenWithCF({"pikachu"}, options); + const uint64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, + kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay); + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); } -TEST_F(DBTest, DeletingOldWalAfterDrop) { - rocksdb::SyncPoint::GetInstance()->LoadDependency( - { { "Test:AllowFlushes", "DBImpl::BGWorkFlush" }, - { "DBImpl::BGWorkFlush:done", "Test:WaitForFlush"} }); - rocksdb::SyncPoint::GetInstance()->ClearTrace(); - - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +TEST_F(DBTest, CloseSpeedup) { Options options = CurrentOptions(); - options.max_total_wal_size = 8192; - options.compression = kNoCompression; - options.write_buffer_size = 1 << 20; - options.level0_file_num_compaction_trigger = (1<<30); - options.level0_slowdown_writes_trigger = (1<<30); - options.level0_stop_writes_trigger = (1<<30); - options.disable_auto_compactions = true; - DestroyAndReopen(options); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - - CreateColumnFamilies({"cf1", "cf2"}, options); - 
ASSERT_OK(Put(0, "key1", DummyString(8192))); - ASSERT_OK(Put(0, "key2", DummyString(8192))); - // the oldest wal should now be getting_flushed - ASSERT_OK(db_->DropColumnFamily(handles_[0])); - // all flushes should now do nothing because their CF is dropped - TEST_SYNC_POINT("Test:AllowFlushes"); - TEST_SYNC_POINT("Test:WaitForFlush"); - uint64_t lognum1 = dbfull()->TEST_LogfileNumber(); - ASSERT_OK(Put(1, "key3", DummyString(8192))); - ASSERT_OK(Put(1, "key4", DummyString(8192))); - // new wal should have been created - uint64_t lognum2 = dbfull()->TEST_LogfileNumber(); - EXPECT_GT(lognum2, lognum1); -} - -TEST_F(DBTest, DBWithSstFileManager) { - std::shared_ptr sst_file_manager(NewSstFileManager(env_)); - auto sfm = static_cast(sst_file_manager.get()); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_write_buffer_number = 16; - int files_added = 0; - int files_deleted = 0; - int files_moved = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "SstFileManagerImpl::OnAddFile", [&](void* arg) { files_added++; }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { files_deleted++; }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "SstFileManagerImpl::OnMoveFile", [&](void* arg) { files_moved++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Block background threads + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); - 
Options options = CurrentOptions(); - options.sst_file_manager = sst_file_manager; + std::vector filenames; + env_->GetChildren(dbname_, &filenames); + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(dbname_ + "/" + filenames[i]); + } + env_->DeleteDir(dbname_); DestroyAndReopen(options); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); Random rnd(301); - for (int i = 0; i < 25; i++) { - GenerateNewRandomFile(&rnd); - ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - // Verify that we are tracking all sst files in dbname_ - ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles()); - } - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + int key_idx = 0; - auto files_in_db = GetAllSSTFiles(); - // Verify that we are tracking all sst files in dbname_ - ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); - // Verify the total files size - uint64_t total_files_size = 0; - for (auto& file_to_size : files_in_db) { - total_files_size += file_to_size.second; - } - ASSERT_EQ(sfm->GetTotalSize(), total_files_size); - // We flushed at least 25 files - ASSERT_GE(files_added, 25); - // Compaction must have deleted some files - ASSERT_GT(files_deleted, 0); - // No files were moved - ASSERT_EQ(files_moved, 0); + // First three 110KB files are not going to level 2 + // After that, (100K, 200K) + for (int num = 0; num < 5; num++) { + GenerateNewFile(&rnd, &key_idx, true); + } - Close(); - Reopen(options); - ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); - ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + ASSERT_EQ(0, GetSstFileCount(dbname_)); - // Verify that we track all the files again after the DB is closed and opened Close(); - sst_file_manager.reset(NewSstFileManager(env_)); - options.sst_file_manager = sst_file_manager; - sfm = static_cast(sst_file_manager.get()); + 
ASSERT_EQ(0, GetSstFileCount(dbname_)); - Reopen(options); - ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); - ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + // Unblock background threads + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); } -#ifndef ROCKSDB_LITE -TEST_F(DBTest, RateLimitedDelete) { - rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"DBTest::RateLimitedDelete:1", "DeleteScheduler::BackgroundEmptyTrash"}, - }); - - std::vector penalties; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteScheduler::BackgroundEmptyTrash:Wait", - [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.env = env_; - - std::string trash_dir = test::TmpDir(env_) + "/trash"; - int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec - Status s; - options.sst_file_manager.reset(NewSstFileManager( - env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); - ASSERT_OK(s); - auto sfm = static_cast(options.sst_file_manager.get()); - - Destroy(last_options_); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(TryReopen(options)); - // Create 4 files in L0 - for (char v = 'a'; v <= 'd'; v++) { - ASSERT_OK(Put("Key2", DummyString(1024, v))); - ASSERT_OK(Put("Key3", DummyString(1024, v))); - ASSERT_OK(Put("Key4", DummyString(1024, v))); - ASSERT_OK(Put("Key1", DummyString(1024, v))); - ASSERT_OK(Put("Key4", DummyString(1024, v))); - ASSERT_OK(Flush()); - } - // We created 4 sst files in L0 - ASSERT_EQ("4", FilesPerLevel(0)); - - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - - // Compaction will move the 4 files in L0 to trash and create 1 L1 file - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, 
nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(0)); - - uint64_t delete_start_time = env_->NowMicros(); - // Hold BackgroundEmptyTrash - TEST_SYNC_POINT("DBTest::RateLimitedDelete:1"); - sfm->WaitForEmptyTrash(); - uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time; +class DelayedMergeOperator : public MergeOperator { + private: + DBTest* db_test_; - uint64_t total_files_size = 0; - uint64_t expected_penlty = 0; - ASSERT_EQ(penalties.size(), metadata.size()); - for (size_t i = 0; i < metadata.size(); i++) { - total_files_size += metadata[i].size; - expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec); - ASSERT_EQ(expected_penlty, penalties[i]); + public: + explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} + virtual bool FullMerge(const Slice& key, const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override { + db_test_->env_->addon_time_.fetch_add(1000); + *new_value = ""; + return true; } - ASSERT_GT(time_spent_deleting, expected_penlty * 0.9); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); -} + virtual const char* Name() const override { return "DelayedMergeOperator"; } +}; -// Create a DB with 2 db_paths, and generate multiple files in the 2 -// db_paths using CompactRangeOptions, make sure that files that were -// deleted from first db_path were deleted using DeleteScheduler and -// files in the second path were not. 
-TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { - int bg_delete_file = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteScheduler::DeleteTrashFile:DeleteFile", - [&](void* arg) { bg_delete_file++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +TEST_F(DBTest, MergeTestTime) { + std::string one, two, three; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + // Enable time profiling + SetPerfLevel(kEnableTime); + this->env_->addon_time_.store(0); + this->env_->time_elapse_only_sleep_ = true; + this->env_->no_sleep_ = true; Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.db_paths.emplace_back(dbname_, 1024 * 100); - options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100); - options.env = env_; - - std::string trash_dir = test::TmpDir(env_) + "/trash"; - int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec - Status s; - options.sst_file_manager.reset(NewSstFileManager( - env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); - ASSERT_OK(s); - auto sfm = static_cast(options.sst_file_manager.get()); - + options.statistics = rocksdb::CreateDBStatistics(); + options.merge_operator.reset(new DelayedMergeOperator(this)); DestroyAndReopen(options); - // Create 4 files in L0 - for (int i = 0; i < 4; i++) { - ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); - ASSERT_OK(Flush()); - } - // We created 4 sst files in L0 - ASSERT_EQ("4", FilesPerLevel(0)); - // Compaction will delete files from L0 in first db path and generate a new - // file in L1 in second db path - CompactRangeOptions compact_options; - compact_options.target_path_id = 1; - Slice begin("Key0"); - Slice end("Key3"); - ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); - ASSERT_EQ("0,1", FilesPerLevel(0)); - - // Create 4 files in L0 - for (int i = 4; i < 8; i++) { - ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'))); - ASSERT_OK(Flush()); - } - ASSERT_EQ("4,1", FilesPerLevel(0)); - - 
// Compaction will delete files from L0 in first db path and generate a new - // file in L1 in second db path - begin = "Key4"; - end = "Key7"; - ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); - ASSERT_EQ("0,2", FilesPerLevel(0)); + ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); + db_->Put(WriteOptions(), "foo", one); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", three)); + ASSERT_OK(Flush()); - sfm->WaitForEmptyTrash(); - ASSERT_EQ(bg_delete_file, 8); + ReadOptions opt; + opt.verify_checksums = true; + opt.snapshot = nullptr; + std::string result; + db_->Get(opt, "foo", &result); - compact_options.bottommost_level_compaction = - BottommostLevelCompaction::kForce; - ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); - sfm->WaitForEmptyTrash(); - ASSERT_EQ(bg_delete_file, 8); + ReadOptions read_options; + std::unique_ptr iter(db_->NewIterator(read_options)); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(1, count); + ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); +#if ROCKSDB_USING_THREAD_STATUS + ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0); +#endif // ROCKSDB_USING_THREAD_STATUS + this->env_->time_elapse_only_sleep_ = false; } -TEST_F(DBTest, DestroyDBWithRateLimitedDelete) { - int bg_delete_file = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteScheduler::DeleteTrashFile:DeleteFile", - [&](void* arg) { bg_delete_file++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - +#ifndef ROCKSDB_LITE +TEST_P(DBTestWithParam, MergeCompactionTimeTest) { + SetPerfLevel(kEnableTime); Options options = 
CurrentOptions(); - options.disable_auto_compactions = true; - options.env = env_; + options.compaction_filter_factory = std::make_shared(); + options.statistics = rocksdb::CreateDBStatistics(); + options.merge_operator.reset(new DelayedMergeOperator(this)); + options.compaction_style = kCompactionStyleUniversal; + options.max_subcompactions = max_subcompactions_; DestroyAndReopen(options); - // Create 4 files in L0 - for (int i = 0; i < 4; i++) { - ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); + for (int i = 0; i < 1000; i++) { + ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST")); ASSERT_OK(Flush()); } - // We created 4 sst files in L0 - ASSERT_EQ("4", FilesPerLevel(0)); - - // Close DB and destroy it using DeleteScheduler - Close(); - std::string trash_dir = test::TmpDir(env_) + "/trash"; - int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec - Status s; - options.sst_file_manager.reset(NewSstFileManager( - env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); - ASSERT_OK(s); - ASSERT_OK(DestroyDB(dbname_, options)); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); - auto sfm = static_cast(options.sst_file_manager.get()); - sfm->WaitForEmptyTrash(); - // We have deleted the 4 sst files in the delete_scheduler - ASSERT_EQ(bg_delete_file, 4); + ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); } -#endif // ROCKSDB_LITE - -TEST_F(DBTest, DBWithMaxSpaceAllowed) { - std::shared_ptr sst_file_manager(NewSstFileManager(env_)); - auto sfm = static_cast(sst_file_manager.get()); +TEST_P(DBTestWithParam, FilterCompactionTimeTest) { Options options = CurrentOptions(); - options.sst_file_manager = sst_file_manager; + options.compaction_filter_factory = + std::make_shared(this); options.disable_auto_compactions = true; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.max_subcompactions = max_subcompactions_; DestroyAndReopen(options); - Random rnd(301); - 
- // Generate a file containing 100 keys. - for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + // put some data + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); } - ASSERT_OK(Flush()); - uint64_t first_file_size = 0; - auto files_in_db = GetAllSSTFiles(&first_file_size); - ASSERT_EQ(sfm->GetTotalSize(), first_file_size); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ(0U, CountLiveFiles()); - // Set the maximum allowed space usage to the current total size - sfm->SetMaxAllowedSpaceUsage(first_file_size + 1); + Reopen(options); - ASSERT_OK(Put("key1", "val1")); - // This flush will cause bg_error_ and will fail - ASSERT_NOK(Flush()); + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); + delete itr; } +#endif // ROCKSDB_LITE -TEST_F(DBTest, DBWithMaxSpaceAllowedRandomized) { - // This test will set a maximum allowed space for the DB, then it will - // keep filling the DB until the limit is reached and bg_error_ is set. - // When bg_error_ is set we will verify that the DB size is greater - // than the limit. 
- - std::vector max_space_limits_mbs = {1, 2, 4, 8, 10}; - - bool bg_error_set = false; - uint64_t total_sst_files_size = 0; - - int reached_max_space_on_flush = 0; - int reached_max_space_on_compaction = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", - [&](void* arg) { - bg_error_set = true; - GetAllSSTFiles(&total_sst_files_size); - reached_max_space_on_flush++; - }); - - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached", - [&](void* arg) { - bg_error_set = true; - GetAllSSTFiles(&total_sst_files_size); - reached_max_space_on_compaction++; - }); - - for (auto limit_mb : max_space_limits_mbs) { - bg_error_set = false; - total_sst_files_size = 0; - rocksdb::SyncPoint::GetInstance()->ClearTrace(); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - std::shared_ptr sst_file_manager(NewSstFileManager(env_)); - auto sfm = static_cast(sst_file_manager.get()); - - Options options = CurrentOptions(); - options.sst_file_manager = sst_file_manager; - options.write_buffer_size = 1024 * 512; // 512 Kb - DestroyAndReopen(options); - Random rnd(301); - - sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024); +TEST_F(DBTest, TestLogCleanup) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; // very small + // only two memtables allowed ==> only two log files + options.max_write_buffer_number = 2; + Reopen(options); - int keys_written = 0; - uint64_t estimated_db_size = 0; - while (true) { - auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); - if (!s.ok()) { - break; - } - keys_written++; - // Check the estimated db size vs the db limit just to make sure we - // dont run into an infinite loop - estimated_db_size = keys_written * 60; // ~60 bytes per key - ASSERT_LT(estimated_db_size, limit_mb * 1024 * 1024 * 2); - } - ASSERT_TRUE(bg_error_set); - ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 
1024); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + for (int i = 0; i < 100000; ++i) { + Put(Key(i), "val"); + // only 2 memtables will be alive, so logs_to_free needs to always be below + // 2 + ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); } - - ASSERT_GT(reached_max_space_on_flush, 0); - ASSERT_GT(reached_max_space_on_compaction, 0); } -TEST_F(DBTest, UnsupportedManualSync) { - DestroyAndReopen(CurrentOptions()); - env_->is_wal_sync_thread_safe_.store(false); - Status s = db_->SyncWAL(); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, EmptyCompactedDB) { + Options options = CurrentOptions(); + options.max_open_files = -1; + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + Status s = Put("new", "value"); ASSERT_TRUE(s.IsNotSupported()); + Close(); } +#endif // ROCKSDB_LITE #ifndef ROCKSDB_LITE -TEST_F(DBTest, OpenDBWithInfiniteMaxOpenFiles) { - // Open DB with infinite max open files - // - First iteration use 1 thread to open files - // - Second iteration use 5 threads to open files - for (int iter = 0; iter < 2; iter++) { - Options options; - options.create_if_missing = true; - options.write_buffer_size = 100000; - options.disable_auto_compactions = true; - options.max_open_files = -1; - if (iter == 0) { - options.max_file_opening_threads = 1; - } else { - options.max_file_opening_threads = 5; - } - options = CurrentOptions(options); - DestroyAndReopen(options); - - // Create 12 Files in L0 (then move then to L2) - for (int i = 0; i < 12; i++) { - std::string k = "L2_" + Key(i); - ASSERT_OK(Put(k, k + std::string(1000, 'a'))); - ASSERT_OK(Flush()); - } - CompactRangeOptions compact_options; - compact_options.change_level = true; - compact_options.target_level = 2; - db_->CompactRange(compact_options, nullptr, nullptr); - - // Create 12 Files in L0 - for (int i = 0; i < 12; i++) { - std::string k = "L0_" + Key(i); - ASSERT_OK(Put(k, k + std::string(1000, 'a'))); - ASSERT_OK(Flush()); +TEST_F(DBTest, SuggestCompactRangeTest) { + class 
CompactionFilterFactoryGetContext : public CompactionFilterFactory { + public: + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + saved_context = context; + std::unique_ptr empty_filter; + return empty_filter; } - Close(); - - // Reopening the DB will load all exisitng files - Reopen(options); - ASSERT_EQ("12,0,12", FilesPerLevel(0)); - std::vector> files; - dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); - - for (const auto& level : files) { - for (const auto& file : level) { - ASSERT_TRUE(file.table_reader_handle != nullptr); - } + const char* Name() const override { + return "CompactionFilterFactoryGetContext"; } - - for (int i = 0; i < 12; i++) { - ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a')); - ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a')); + static bool IsManual(CompactionFilterFactory* compaction_filter_factory) { + return reinterpret_cast( + compaction_filter_factory) + ->saved_context.is_manual_compaction; } - } -} + CompactionFilter::Context saved_context; + }; -TEST_F(DBTest, GetTotalSstFilesSize) { Options options = CurrentOptions(); - options.disable_auto_compactions = true; + options.memtable_factory.reset( + new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.compaction_style = kCompactionStyleLevel; + options.compaction_filter_factory.reset( + new CompactionFilterFactoryGetContext()); + options.write_buffer_size = 200 << 10; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; options.compression = kNoCompression; - DestroyAndReopen(options); - // Generate 5 files in L0 - for (int i = 0; i < 5; i++) { - for (int j = 0; j < 10; j++) { - std::string val = "val_file_" + ToString(i); - ASSERT_OK(Put(Key(j), val)); - } - Flush(); - } - ASSERT_EQ("5", FilesPerLevel(0)); - - std::vector live_files_meta; - 
dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 5); - uint64_t single_file_size = live_files_meta[0].size; - - uint64_t live_sst_files_size = 0; - uint64_t total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; - } - - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 5 - // Total SST files = 5 - ASSERT_EQ(live_sst_files_size, 5 * single_file_size); - ASSERT_EQ(total_sst_files_size, 5 * single_file_size); - - // hold current version - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 98 << 10; + options.max_grandparent_overlap_factor = 1 << 20; // inf - // Compact 5 files into 1 file in L0 - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(0)); + Reopen(options); - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 1); + Random rnd(301); - live_sst_files_size = 0; - total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; + for (int num = 0; num < 3; num++) { + GenerateNewRandomFile(&rnd); } - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 1 (compacted file) - // Total SST files = 6 (5 original files + compacted file) - ASSERT_EQ(live_sst_files_size, 1 * single_file_size); - ASSERT_EQ(total_sst_files_size, 6 * single_file_size); - // hold current version - std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("0,4", FilesPerLevel(0)); + ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual( + options.compaction_filter_factory.get())); - // Delete all keys and compact, this will delete all live files - for 
(int i = 0; i < 10; i++) { - ASSERT_OK(Delete(Key(i))); - } - Flush(); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("", FilesPerLevel(0)); - - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 0); - - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 6 (5 original files + compacted file) - ASSERT_EQ(total_sst_files_size, 6 * single_file_size); - - iter1.reset(); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 1 (compacted file) - ASSERT_EQ(total_sst_files_size, 1 * single_file_size); - - iter2.reset(); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 0 - ASSERT_EQ(total_sst_files_size, 0); -} - -TEST_F(DBTest, GetTotalSstFilesSizeVersionsFilesShared) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.compression = kNoCompression; - DestroyAndReopen(options); - // Generate 5 files in L0 - for (int i = 0; i < 5; i++) { - ASSERT_OK(Put(Key(i), "val")); - Flush(); - } - ASSERT_EQ("5", FilesPerLevel(0)); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("1,4", FilesPerLevel(0)); - std::vector live_files_meta; - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 5); - uint64_t single_file_size = live_files_meta[0].size; + GenerateNewRandomFile(&rnd); + ASSERT_EQ("2,4", FilesPerLevel(0)); - uint64_t live_sst_files_size = 0; - uint64_t total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; - } + GenerateNewRandomFile(&rnd); + ASSERT_EQ("3,4", FilesPerLevel(0)); + + GenerateNewRandomFile(&rnd); + ASSERT_EQ("0,4,4", FilesPerLevel(0)); - 
ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); - // Live SST files = 5 - // Total SST files = 5 - ASSERT_EQ(live_sst_files_size, 5 * single_file_size); - ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("2,4,4", FilesPerLevel(0)); - // hold current version - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("3,4,4", FilesPerLevel(0)); - // Compaction will do trivial move from L0 to L1 - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("0,5", FilesPerLevel(0)); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("0,4,8", FilesPerLevel(0)); - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 5); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); - live_sst_files_size = 0; - total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; + // compact it three times + for (int i = 0; i < 3; ++i) { + ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); + dbfull()->TEST_WaitForCompact(); } - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 5 - // Total SST files = 5 (used in 2 version) - ASSERT_EQ(live_sst_files_size, 5 * single_file_size); - ASSERT_EQ(total_sst_files_size, 5 * single_file_size); - // hold current version - std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + // All files are compacted + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); - // Delete all keys and compact, this will delete all live files - for (int i = 0; i < 5; i++) { - ASSERT_OK(Delete(Key(i))); - } - Flush(); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, 
nullptr)); - ASSERT_EQ("", FilesPerLevel(0)); + GenerateNewRandomFile(&rnd); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 0); + // nonoverlapping with the file on level 0 + Slice start("a"), end("b"); + ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 5 (used in 2 version) - ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + // should not compact the level 0 file + ASSERT_EQ(1, NumTableFilesAtLevel(0)); - iter1.reset(); - iter2.reset(); + start = Slice("j"); + end = Slice("m"); + ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual( + options.compaction_filter_factory.get())); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 0 - ASSERT_EQ(total_sst_files_size, 0); + // now it should compact the level 0 file + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); } -TEST_F(DBTest, AddExternalSstFile) { - do { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - env_->CreateDir(sst_files_folder); - Options options = CurrentOptions(); - options.env = env_; - const ImmutableCFOptions ioptions(options); - - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); - - // file1.sst (0 => 99) - std::string file1 = sst_files_folder + "file1.sst"; - ASSERT_OK(sst_file_writer.Open(file1)); - for (int k = 0; k < 100; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - 
ASSERT_EQ(file1_info.file_path, file1); - ASSERT_EQ(file1_info.num_entries, 100); - ASSERT_EQ(file1_info.smallest_key, Key(0)); - ASSERT_EQ(file1_info.largest_key, Key(99)); - // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Add(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); - - // file2.sst (100 => 199) - std::string file2 = sst_files_folder + "file2.sst"; - ASSERT_OK(sst_file_writer.Open(file2)); - for (int k = 100; k < 200; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - // Cannot add this key because it's not after last added key - s = sst_file_writer.Add(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); - ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file2_info.file_path, file2); - ASSERT_EQ(file2_info.num_entries, 100); - ASSERT_EQ(file2_info.smallest_key, Key(100)); - ASSERT_EQ(file2_info.largest_key, Key(199)); - - // file3.sst (195 => 299) - // This file values overlap with file2 values - std::string file3 = sst_files_folder + "file3.sst"; - ASSERT_OK(sst_file_writer.Open(file3)); - for (int k = 195; k < 300; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); - } - ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file3_info.file_path, file3); - ASSERT_EQ(file3_info.num_entries, 105); - ASSERT_EQ(file3_info.smallest_key, Key(195)); - ASSERT_EQ(file3_info.largest_key, Key(299)); - - // file4.sst (30 => 39) - // This file values overlap with file1 values - std::string file4 = sst_files_folder + "file4.sst"; - ASSERT_OK(sst_file_writer.Open(file4)); - for (int k = 30; k < 40; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); - } - ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file4_info.file_path, 
file4); - ASSERT_EQ(file4_info.num_entries, 10); - ASSERT_EQ(file4_info.smallest_key, Key(30)); - ASSERT_EQ(file4_info.largest_key, Key(39)); - - // file5.sst (400 => 499) - std::string file5 = sst_files_folder + "file5.sst"; - ASSERT_OK(sst_file_writer.Open(file5)); - for (int k = 400; k < 500; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file5_info.file_path, file5); - ASSERT_EQ(file5_info.num_entries, 100); - ASSERT_EQ(file5_info.smallest_key, Key(400)); - ASSERT_EQ(file5_info.largest_key, Key(499)); - - // Cannot create an empty sst file - std::string file_empty = sst_files_folder + "file_empty.sst"; - ExternalSstFileInfo file_empty_info; - s = sst_file_writer.Finish(&file_empty_info); - ASSERT_NOK(s); +TEST_F(DBTest, PromoteL0) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + DestroyAndReopen(options); - DestroyAndReopen(options); - // Add file using file path - s = db_->AddFile(file1); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); - for (int k = 0; k < 100; k++) { - ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); - } + // non overlapping ranges + std::vector> ranges = { + {81, 160}, {0, 80}, {161, 240}, {241, 320}}; - // Add file while holding a snapshot will fail - const Snapshot* s1 = db_->GetSnapshot(); - if (s1 != nullptr) { - ASSERT_NOK(db_->AddFile(&file2_info)); - db_->ReleaseSnapshot(s1); - } - // We can add the file after releaseing the snapshot - ASSERT_OK(db_->AddFile(&file2_info)); + int32_t value_size = 10 * 1024; // 10 KB - ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); - for (int k = 0; k < 200; k++) { - ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + Random rnd(301); + std::map values; + for (const auto& range : ranges) { + for (int32_t j = range.first; j < range.second; j++) { + 
values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); } + ASSERT_OK(Flush()); + } - // This file have overlapping values with the exisitng data - s = db_->AddFile(file3); - ASSERT_FALSE(s.ok()) << s.ToString(); - - // This file have overlapping values with the exisitng data - s = db_->AddFile(&file4_info); - ASSERT_FALSE(s.ok()) << s.ToString(); - - // Overwrite values of keys divisible by 5 - for (int k = 0; k < 200; k += 5) { - ASSERT_OK(Put(Key(k), Key(k) + "_val_new")); - } - ASSERT_NE(db_->GetLatestSequenceNumber(), 0U); + int32_t level0_files = NumTableFilesAtLevel(0, 0); + ASSERT_EQ(level0_files, ranges.size()); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 - // Key range of file5 (400 => 499) dont overlap with any keys in DB - ASSERT_OK(db_->AddFile(file5)); + // Promote L0 level to L2. + ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2)); + // We expect that all the files were trivially moved from L0 to L2 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files); - // Make sure values are correct before and after flush/compaction - for (int i = 0; i < 2; i++) { - for (int k = 0; k < 200; k++) { - std::string value = Key(k) + "_val"; - if (k % 5 == 0) { - value += "_new"; - } - ASSERT_EQ(Get(Key(k)), value); - } - for (int k = 400; k < 500; k++) { - std::string value = Key(k) + "_val"; - ASSERT_EQ(Get(Key(k)), value); - } - ASSERT_OK(Flush()); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - } + for (const auto& kv : values) { + ASSERT_EQ(Get(Key(kv.first)), kv.second); + } +} - Close(); - options.disable_auto_compactions = true; - Reopen(options); +TEST_F(DBTest, PromoteL0Failure) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + DestroyAndReopen(options); - // Delete keys in range (400 => 499) - for (int k = 400; k < 500; k++) { - 
ASSERT_OK(Delete(Key(k))); - } - // We deleted range (400 => 499) but cannot add file5 because - // of the range tombstones - ASSERT_NOK(db_->AddFile(file5)); + // Produce two L0 files with overlapping ranges. + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(Put(Key(3), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "")); + ASSERT_OK(Flush()); - // Compacting the DB will remove the tombstones - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Status status; + // Fails because L0 has overlapping files. + status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); + ASSERT_TRUE(status.IsInvalidArgument()); - // Now we can add the file - ASSERT_OK(db_->AddFile(file5)); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Now there is a file in L1. + ASSERT_GE(NumTableFilesAtLevel(1, 0), 1); - // Verify values of file5 in DB - for (int k = 400; k < 500; k++) { - std::string value = Key(k) + "_val"; - ASSERT_EQ(Get(Key(k)), value); - } - } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | - kSkipFIFOCompaction)); + ASSERT_OK(Put(Key(5), "")); + ASSERT_OK(Flush()); + // Fails because L1 is non-empty. + status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); + ASSERT_TRUE(status.IsInvalidArgument()); } +#endif // ROCKSDB_LITE -// This test reporduce a bug that can happen in some cases if the DB started -// purging obsolete files when we are adding an external sst file. -// This situation may result in deleting the file while it's being added. 
-TEST_F(DBTest, AddExternalSstFilePurgeObsoleteFilesBug) { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - env_->CreateDir(sst_files_folder); +// Github issue #596 +TEST_F(DBTest, HugeNumberOfLevels) { Options options = CurrentOptions(); - options.env = env_; - const ImmutableCFOptions ioptions(options); - - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + options.write_buffer_size = 2 * 1024 * 1024; // 2MB + options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB + options.num_levels = 12; + options.max_background_compactions = 10; + options.max_bytes_for_level_multiplier = 2; + options.level_compaction_dynamic_level_bytes = true; + DestroyAndReopen(options); - // file1.sst (0 => 500) - std::string sst_file_path = sst_files_folder + "file1.sst"; - Status s = sst_file_writer.Open(sst_file_path); - ASSERT_OK(s); - for (int i = 0; i < 500; i++) { - std::string k = Key(i); - s = sst_file_writer.Add(k, k + "_val"); - ASSERT_OK(s); + Random rnd(301); + for (int i = 0; i < 300000; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); } - ExternalSstFileInfo sst_file_info; - s = sst_file_writer.Finish(&sst_file_info); - ASSERT_OK(s); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +} - options.delete_obsolete_files_period_micros = 0; - options.disable_auto_compactions = true; +TEST_F(DBTest, AutomaticConflictsWithManualCompaction) { + Options options = CurrentOptions(); + options.write_buffer_size = 2 * 1024 * 1024; // 2MB + options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB + options.num_levels = 12; + options.max_background_compactions = 10; + options.max_bytes_for_level_multiplier = 2; + options.level_compaction_dynamic_level_bytes = true; DestroyAndReopen(options); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::AddFile:FileCopied", [&](void* arg) { - ASSERT_OK(Put("aaa", "bbb")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("aaa", "xxx")); - ASSERT_OK(Flush()); - 
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - - s = db_->AddFile(sst_file_path); - ASSERT_OK(s); - - for (int i = 0; i < 500; i++) { - std::string k = Key(i); - std::string v = k + "_val"; - ASSERT_EQ(Get(k), v); + Random rnd(301); + for (int i = 0; i < 300000; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); } + std::atomic callback_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction()::Conflict", + [&](void* arg) { callback_count.fetch_add(1); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + CompactRangeOptions croptions; + croptions.exclusive_manual_compaction = false; + ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); + ASSERT_GE(callback_count.load(), 1); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + for (int i = 0; i < 300000; ++i) { + ASSERT_NE("NOT_FOUND", Get(Key(i))); + } } -TEST_F(DBTest, AddExternalSstFileNoCopy) { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - env_->CreateDir(sst_files_folder); +// Github issue #595 +// Large write batch with column families +TEST_F(DBTest, LargeBatchWithColumnFamilies) { Options options = CurrentOptions(); options.env = env_; - const ImmutableCFOptions ioptions(options); - - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); - - // file1.sst (0 => 99) - std::string file1 = sst_files_folder + "file1.sst"; - ASSERT_OK(sst_file_writer.Open(file1)); - for (int k = 0; k < 100; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file1_info.file_path, file1); - ASSERT_EQ(file1_info.num_entries, 100); - ASSERT_EQ(file1_info.smallest_key, Key(0)); - ASSERT_EQ(file1_info.largest_key, Key(99)); - - // file2.sst (100 => 299) - std::string file2 = sst_files_folder + 
"file2.sst"; - ASSERT_OK(sst_file_writer.Open(file2)); - for (int k = 100; k < 300; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file2_info.file_path, file2); - ASSERT_EQ(file2_info.num_entries, 200); - ASSERT_EQ(file2_info.smallest_key, Key(100)); - ASSERT_EQ(file2_info.largest_key, Key(299)); - - // file3.sst (110 => 124) .. overlap with file2.sst - std::string file3 = sst_files_folder + "file3.sst"; - ASSERT_OK(sst_file_writer.Open(file3)); - for (int k = 110; k < 125; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); - } - ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file3_info.file_path, file3); - ASSERT_EQ(file3_info.num_entries, 15); - ASSERT_EQ(file3_info.smallest_key, Key(110)); - ASSERT_EQ(file3_info.largest_key, Key(124)); - - s = db_->AddFile(&file1_info, true /* move file */); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); - - s = db_->AddFile(&file2_info, false /* copy file */); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_OK(env_->FileExists(file2)); - - // This file have overlapping values with the exisitng data - s = db_->AddFile(&file3_info, true /* move file */); - ASSERT_FALSE(s.ok()) << s.ToString(); - ASSERT_OK(env_->FileExists(file3)); - - for (int k = 0; k < 300; k++) { - ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); - } -} - -TEST_F(DBTest, AddExternalSstFileMultiThreaded) { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - // Bulk load 10 files every file contain 1000 keys - int num_files = 10; - int keys_per_file = 1000; - - // Generate file names - std::vector file_names; - for (int i = 0; i < num_files; i++) { - std::string file_name = "file_" + ToString(i) + ".sst"; - file_names.push_back(sst_files_folder + 
file_name); - } - - do { - env_->CreateDir(sst_files_folder); - Options options = CurrentOptions(); - const ImmutableCFOptions ioptions(options); - - std::atomic thread_num(0); - std::function write_file_func = [&]() { - int file_idx = thread_num.fetch_add(1); - int range_start = file_idx * keys_per_file; - int range_end = range_start + keys_per_file; - - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); - - ASSERT_OK(sst_file_writer.Open(file_names[file_idx])); - - for (int k = range_start; k < range_end; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k))); + options.write_buffer_size = 100000; // Small write buffer + CreateAndReopenWithCF({"pikachu"}, options); + int64_t j = 0; + for (int i = 0; i < 5; i++) { + for (int pass = 1; pass <= 3; pass++) { + WriteBatch batch; + size_t write_size = 1024 * 1024 * (5 + i); + fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n", + (write_size / 1024 / 1024), pass); + for (;;) { + std::string data(3000, j++ % 127 + 20); + data += ToString(j); + batch.Put(handles_[0], Slice(data), Slice(data)); + if (batch.GetDataSize() > write_size) { + break; + } } - - Status s = sst_file_writer.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); - }; - // Write num_files files in parallel - std::vector sst_writer_threads; - for (int i = 0; i < num_files; ++i) { - sst_writer_threads.emplace_back(write_file_func); + fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n", + (batch.GetDataSize() / 1024 / 1024)); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + fprintf(stderr, "done\n"); } + } + // make sure we can re-open it. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); +} - for (auto& t : sst_writer_threads) { - t.join(); +// Make sure that Flushes can proceed in parallel with CompactRange() +TEST_F(DBTest, FlushesInParallelWithCompactRange) { + // iter == 0 -- leveled + // iter == 1 -- leveled, but throw in a flush between two levels compacting + // iter == 2 -- universal + for (int iter = 0; iter < 3; ++iter) { + Options options = CurrentOptions(); + if (iter < 2) { + options.compaction_style = kCompactionStyleLevel; + } else { + options.compaction_style = kCompactionStyleUniversal; } + options.write_buffer_size = 110 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 98 << 10; + options.max_write_buffer_number = 2; - fprintf(stderr, "Wrote %d files (%d keys)\n", num_files, - num_files * keys_per_file); - - thread_num.store(0); - std::atomic files_added(0); - std::function load_file_func = [&]() { - // We intentionally add every file twice, and assert that it was added - // only once and the other add failed - int thread_id = thread_num.fetch_add(1); - int file_idx = thread_id / 2; - // sometimes we use copy, sometimes link .. 
the result should be the same - bool move_file = (thread_id % 3 == 0); - - Status s = db_->AddFile(file_names[file_idx], move_file); - if (s.ok()) { - files_added++; - } - }; - // Bulk load num_files files in parallel - std::vector add_file_threads; DestroyAndReopen(options); - for (int i = 0; i < num_files * 2; ++i) { - add_file_threads.emplace_back(load_file_func); - } - - for (auto& t : add_file_threads) { - t.join(); - } - ASSERT_EQ(files_added.load(), num_files); - fprintf(stderr, "Loaded %d files (%d keys)\n", num_files, - num_files * keys_per_file); - - // Overwrite values of keys divisible by 100 - for (int k = 0; k < num_files * keys_per_file; k += 100) { - std::string key = Key(k); - Status s = Put(key, key + "_new"); - ASSERT_TRUE(s.ok()); - } - for (int i = 0; i < 2; i++) { - // Make sure the values are correct before and after flush/compaction - for (int k = 0; k < num_files * keys_per_file; ++k) { - std::string key = Key(k); - std::string value = (k % 100 == 0) ? (key + "_new") : key; - ASSERT_EQ(Get(key), value); - } - ASSERT_OK(Flush()); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Random rnd(301); + for (int num = 0; num < 14; num++) { + GenerateNewRandomFile(&rnd); } - fprintf(stderr, "Verified %d values\n", num_files * keys_per_file); - } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | - kSkipFIFOCompaction)); -} - -TEST_F(DBTest, AddExternalSstFileOverlappingRanges) { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - Random rnd(301); - do { - env_->CreateDir(sst_files_folder); - Options options = CurrentOptions(); - DestroyAndReopen(options); - const ImmutableCFOptions ioptions(options); - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); - - printf("Option config = %d\n", option_config_); - std::vector> key_ranges; - for (int i = 0; i < 500; i++) { - int range_start = rnd.Uniform(20000); - int keys_per_range = 10 + rnd.Uniform(41); - - 
key_ranges.emplace_back(range_start, range_start + keys_per_range); + if (iter == 1) { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction()::1", + "DBTest::FlushesInParallelWithCompactRange:1"}, + {"DBTest::FlushesInParallelWithCompactRange:2", + "DBImpl::RunManualCompaction()::2"}}); + } else { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"CompactionJob::Run():Start", + "DBTest::FlushesInParallelWithCompactRange:1"}, + {"DBTest::FlushesInParallelWithCompactRange:2", + "CompactionJob::Run():End"}}); } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - int memtable_add = 0; - int success_add_file = 0; - int failed_add_file = 0; - std::map true_data; - for (size_t i = 0; i < key_ranges.size(); i++) { - int range_start = key_ranges[i].first; - int range_end = key_ranges[i].second; - - Status s; - std::string range_val = "range_" + ToString(i); - - // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile - if (i && i % 5 == 0) { - // Use DB::Put to insert range (insert into memtable) - range_val += "_put"; - for (int k = range_start; k <= range_end; k++) { - s = Put(Key(k), range_val); - ASSERT_OK(s); - } - memtable_add++; - } else { - // Use DB::AddFile to insert range - range_val += "_add_file"; - - // Generate the file containing the range - std::string file_name = sst_files_folder + env_->GenerateUniqueId(); - ASSERT_OK(sst_file_writer.Open(file_name)); - for (int k = range_start; k <= range_end; k++) { - s = sst_file_writer.Add(Key(k), range_val); - ASSERT_OK(s); - } - ExternalSstFileInfo file_info; - s = sst_file_writer.Finish(&file_info); - ASSERT_OK(s); - - // Insert the generated file - s = db_->AddFile(&file_info); - - auto it = true_data.lower_bound(Key(range_start)); - if (it != true_data.end() && it->first <= Key(range_end)) { - // This range overlap with data already exist in DB - ASSERT_NOK(s); - failed_add_file++; - } else { - ASSERT_OK(s); - success_add_file++; - } - } - - if (s.ok()) { - // 
Update true_data map to include the new inserted data - for (int k = range_start; k <= range_end; k++) { - true_data[Key(k)] = range_val; - } - } + std::vector threads; + threads.emplace_back([&]() { Compact("a", "z"); }); - // Flush / Compact the DB - if (i && i % 50 == 0) { - Flush(); - } - if (i && i % 75 == 0) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - } + TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1"); + + // this has to start a flush. if flushes are blocked, this will try to + // create + // 3 memtables, and that will fail because max_write_buffer_number is 2 + for (int num = 0; num < 3; num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); } - printf( - "Total: %zu ranges\n" - "AddFile()|Success: %d ranges\n" - "AddFile()|RangeConflict: %d ranges\n" - "Put(): %d ranges\n", - key_ranges.size(), success_add_file, failed_add_file, memtable_add); + TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2"); - // Verify the correctness of the data - for (const auto& kv : true_data) { - ASSERT_EQ(Get(kv.first), kv.second); + for (auto& t : threads) { + t.join(); } - printf("keys/values verified\n"); - } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | - kSkipFIFOCompaction)); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } } -#endif // ROCKSDB_LITE +TEST_F(DBTest, DelayedWriteRate) { + const int kEntriesPerMemTable = 100; + const int kTotalFlushes = 20; -// 1 Create some SST files by inserting K-V pairs into DB -// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file -// 3 Open DB and check if all key can be read -TEST_F(DBTest, SSTsWithLdbSuffixHandling) { Options options = CurrentOptions(); - options.write_buffer_size = 110 << 10; // 110KB - options.num_levels = 4; - DestroyAndReopen(options); + env_->SetBackgroundThreads(1, Env::LOW); + options.env = env_; + env_->no_sleep_ = true; + options.write_buffer_size = 100000000; + options.max_write_buffer_number = 
256; + options.max_background_compactions = 1; + options.level0_file_num_compaction_trigger = 3; + options.level0_slowdown_writes_trigger = 3; + options.level0_stop_writes_trigger = 999999; + options.delayed_write_rate = 20000000; // Start with 200MB/s + options.memtable_factory.reset( + new SpecialSkipListFactory(kEntriesPerMemTable)); - Random rnd(301); - int key_id = 0; - for (int i = 0; i < 10; ++i) { - GenerateNewFile(&rnd, &key_id, false); + CreateAndReopenWithCF({"pikachu"}, options); + + // Block compactions + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + for (int i = 0; i < 3; i++) { + Put(Key(i), std::string(10000, 'x')); + Flush(); } - Flush(); - Close(); - int const num_files = GetSstFileCount(dbname_); - ASSERT_GT(num_files, 0); - std::vector filenames; - GetSstFiles(dbname_, &filenames); - int num_ldb_files = 0; - for (size_t i = 0; i < filenames.size(); ++i) { - if (i & 1) { - continue; + // These writes will be slowed down to 1KB/s + uint64_t estimated_sleep_time = 0; + Random rnd(301); + Put("", ""); + uint64_t cur_rate = options.delayed_write_rate; + for (int i = 0; i < kTotalFlushes; i++) { + uint64_t size_memtable = 0; + for (int j = 0; j < kEntriesPerMemTable; j++) { + auto rand_num = rnd.Uniform(20); + // Spread the size range to more. + size_t entry_size = rand_num * rand_num * rand_num; + WriteOptions wo; + Put(Key(i), std::string(entry_size, 'x'), wo); + size_memtable += entry_size + 18; + // Occasionally sleep a while + if (rnd.Uniform(20) == 6) { + env_->SleepForMicroseconds(2666); + } } - std::string const rdb_name = dbname_ + "/" + filenames[i]; - std::string const ldb_name = Rocks2LevelTableFileName(rdb_name); - ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok()); - ++num_ldb_files; + dbfull()->TEST_WaitForFlushMemTable(); + estimated_sleep_time += size_memtable * 1000000u / cur_rate; + // Slow down twice. 
One for memtable switch and one for flush finishes. + cur_rate = static_cast(static_cast(cur_rate) / + kSlowdownRatio / kSlowdownRatio); } - ASSERT_GT(num_ldb_files, 0); - ASSERT_EQ(num_files, GetSstFileCount(dbname_)); + // Estimate the total sleep time fall into the rough range. + ASSERT_GT(env_->addon_time_.load(), + static_cast(estimated_sleep_time / 2)); + ASSERT_LT(env_->addon_time_.load(), + static_cast(estimated_sleep_time * 2)); - Reopen(options); - for (int k = 0; k < key_id; ++k) { - ASSERT_NE("NOT_FOUND", Get(Key(k))); - } - Destroy(options); + env_->no_sleep_ = false; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } -TEST_F(DBTest, PinnedDataIteratorRandomized) { - enum TestConfig { - NORMAL, - CLOSE_AND_OPEN, - COMPACT_BEFORE_READ, - FLUSH_EVERY_1000, - MAX - }; - - // Generate Random data - Random rnd(301); +TEST_F(DBTest, HardLimit) { + Options options = CurrentOptions(); + options.env = env_; + env_->SetBackgroundThreads(1, Env::LOW); + options.max_write_buffer_number = 256; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 * 1024; + options.level0_file_num_compaction_trigger = 4; + options.level0_slowdown_writes_trigger = 999999; + options.level0_stop_writes_trigger = 999999; + options.hard_pending_compaction_bytes_limit = 800 << 10; + options.max_bytes_for_level_base = 10000000000u; + options.max_background_compactions = 1; + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); - int puts = 100000; - int key_pool = static_cast(puts * 0.7); - int key_size = 100; - int val_size = 1000; - int seeks_percentage = 20; // 20% of keys will be used to test seek() - int delete_percentage = 20; // 20% of keys will be deleted - int merge_percentage = 20; // 20% of keys will be added using Merge() + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + 
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); - for (int run_config = 0; run_config < TestConfig::MAX; run_config++) { - Options options = CurrentOptions(); - BlockBasedTableOptions table_options; - table_options.use_delta_encoding = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.merge_operator = MergeOperators::CreatePutOperator(); - DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); - std::vector generated_keys(key_pool); - for (int i = 0; i < key_pool; i++) { - generated_keys[i] = RandomString(&rnd, key_size); - } + std::atomic callback_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack("DBImpl::DelayWrite:Wait", + [&](void* arg) { + callback_count.fetch_add(1); + sleeping_task_low.WakeUp(); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - std::map true_data; - std::vector random_keys; - std::vector deleted_keys; - for (int i = 0; i < puts; i++) { - auto& k = generated_keys[rnd.Next() % key_pool]; - auto v = RandomString(&rnd, val_size); - - // Insert data to true_data map and to DB - true_data[k] = v; - if (rnd.OneIn(static_cast(100.0 / merge_percentage))) { - ASSERT_OK(db_->Merge(WriteOptions(), k, v)); - } else { - ASSERT_OK(Put(k, v)); - } + Random rnd(301); + int key_idx = 0; + for (int num = 0; num < 5; num++) { + GenerateNewFile(&rnd, &key_idx, true); + dbfull()->TEST_WaitForFlushMemTable(); + } - // Pick random keys to be used to test Seek() - if (rnd.OneIn(static_cast(100.0 / seeks_percentage))) { - random_keys.push_back(k); - } + ASSERT_EQ(0, callback_count.load()); - // Delete some random keys - if (rnd.OneIn(static_cast(100.0 / delete_percentage))) { - deleted_keys.push_back(k); - true_data.erase(k); - ASSERT_OK(Delete(k)); - } + for (int num = 0; num < 5; num++) { + GenerateNewFile(&rnd, &key_idx, true); + dbfull()->TEST_WaitForFlushMemTable(); + } + ASSERT_GE(callback_count.load(), 1); - if (run_config == 
TestConfig::FLUSH_EVERY_1000) { - if (i && i % 1000 == 0) { - Flush(); - } - } - } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + sleeping_task_low.WaitUntilDone(); +} - if (run_config == TestConfig::CLOSE_AND_OPEN) { - Close(); - Reopen(options); - } else if (run_config == TestConfig::COMPACT_BEFORE_READ) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - } +#ifndef ROCKSDB_LITE +TEST_F(DBTest, SoftLimit) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options.max_write_buffer_number = 256; + options.level0_file_num_compaction_trigger = 1; + options.level0_slowdown_writes_trigger = 3; + options.level0_stop_writes_trigger = 999999; + options.delayed_write_rate = 20000; // About 200KB/s limited rate + options.soft_pending_compaction_bytes_limit = 200000; + options.target_file_size_base = 99999999; // All into one file + options.max_bytes_for_level_base = 50000; + options.max_bytes_for_level_multiplier = 10; + options.max_background_compactions = 1; + options.compression = kNoCompression; - ReadOptions ro; - ro.pin_data = true; - auto iter = db_->NewIterator(ro); + Reopen(options); + Put(Key(0), ""); - { - // Test Seek to random keys - printf("Testing seek on %zu keys\n", random_keys.size()); - std::vector keys_slices; - std::vector true_keys; - for (auto& k : random_keys) { - iter->Seek(k); - if (!iter->Valid()) { - ASSERT_EQ(true_data.lower_bound(k), true_data.end()); - continue; - } - std::string prop_value; - ASSERT_OK( - iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("1", prop_value); - keys_slices.push_back(iter->key()); - true_keys.push_back(true_data.lower_bound(k)->first); - } + test::SleepingBackgroundTask sleeping_task_low; + // Block compactions + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); - for (size_t i = 0; i < 
keys_slices.size(); i++) { - ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]); - } - } + // Create 3 L0 files, making score of L0 to be 3. + for (int i = 0; i < 3; i++) { + Put(Key(i), std::string(5000, 'x')); + Put(Key(100 - i), std::string(5000, 'x')); + // Flush the file. File size is around 30KB. + Flush(); + } + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - { - // Test iterating all data forward - printf("Testing iterating forward on all keys\n"); - std::vector all_keys; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - std::string prop_value; - ASSERT_OK( - iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("1", prop_value); - all_keys.push_back(iter->key()); - } - ASSERT_EQ(all_keys.size(), true_data.size()); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + sleeping_task_low.Reset(); + dbfull()->TEST_WaitForCompact(); - // Verify that all keys slices are valid - auto data_iter = true_data.begin(); - for (size_t i = 0; i < all_keys.size(); i++) { - ASSERT_EQ(all_keys[i].ToString(), data_iter->first); - data_iter++; - } - } + // Now there is one L1 file but doesn't trigger soft_rate_limit + // The L1 file size is around 30KB. + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - { - // Test iterating all data backward - printf("Testing iterating backward on all keys\n"); - std::vector all_keys; - for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { - std::string prop_value; - ASSERT_OK( - iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("1", prop_value); - all_keys.push_back(iter->key()); - } - ASSERT_EQ(all_keys.size(), true_data.size()); + // Only allow one compactin going through. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void* arg) { + // Schedule a sleeping task. 
+ sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_low, Env::Priority::LOW); + }); - // Verify that all keys slices are valid (backward) - auto data_iter = true_data.rbegin(); - for (size_t i = 0; i < all_keys.size(); i++) { - ASSERT_EQ(all_keys[i].ToString(), data_iter->first); - data_iter++; - } - } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - delete iter; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + // Create 3 L0 files, making score of L0 to be 3 + for (int i = 0; i < 3; i++) { + Put(Key(10 + i), std::string(5000, 'x')); + Put(Key(90 - i), std::string(5000, 'x')); + // Flush the file. File size is around 30KB. + Flush(); } -} -#ifndef ROCKSDB_LITE -TEST_F(DBTest, PinnedDataIteratorMultipleFiles) { - Options options = CurrentOptions(); - BlockBasedTableOptions table_options; - table_options.use_delta_encoding = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.disable_auto_compactions = true; - options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb - DestroyAndReopen(options); + // Wake up sleep task to enable compaction to run and waits + // for it to go to sleep state again to make sure one compaction + // goes through. 
+ sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilSleeping(); - std::map true_data; + // Now there is one L1 file (around 60KB) which exceeds 50KB base by 10KB + // Given level multiplier 10, estimated pending compaction is around 100KB + // doesn't trigger soft_pending_compaction_bytes_limit + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - // Generate 4 sst files in L2 - Random rnd(301); - for (int i = 1; i <= 1000; i++) { - std::string k = Key(i * 3); - std::string v = RandomString(&rnd, 100); - ASSERT_OK(Put(k, v)); - true_data[k] = v; - if (i % 250 == 0) { - ASSERT_OK(Flush()); - } - } - ASSERT_EQ(FilesPerLevel(0), "4"); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ(FilesPerLevel(0), "0,4"); - - // Generate 4 sst files in L0 - for (int i = 1; i <= 1000; i++) { - std::string k = Key(i * 2); - std::string v = RandomString(&rnd, 100); - ASSERT_OK(Put(k, v)); - true_data[k] = v; - if (i % 250 == 0) { - ASSERT_OK(Flush()); - } + // Create 3 L0 files, making score of L0 to be 3, higher than L0. + for (int i = 0; i < 3; i++) { + Put(Key(20 + i), std::string(5000, 'x')); + Put(Key(80 - i), std::string(5000, 'x')); + // Flush the file. File size is around 30KB. + Flush(); } - ASSERT_EQ(FilesPerLevel(0), "4,4"); + // Wake up sleep task to enable compaction to run and waits + // for it to go to sleep state again to make sure one compaction + // goes through. 
+ sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilSleeping(); + + // Now there is one L1 file (around 90KB) which exceeds 50KB base by 40KB + // Given level multiplier 10, estimated pending compaction is around 400KB + // triggerring soft_pending_compaction_bytes_limit + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - // Add some keys/values in memtables - for (int i = 1; i <= 1000; i++) { - std::string k = Key(i); - std::string v = RandomString(&rnd, 100); - ASSERT_OK(Put(k, v)); - true_data[k] = v; - } - ASSERT_EQ(FilesPerLevel(0), "4,4"); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilSleeping(); - ReadOptions ro; - ro.pin_data = true; - auto iter = db_->NewIterator(ro); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - std::vector> results; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - std::string prop_value; - ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("1", prop_value); - results.emplace_back(iter->key(), iter->value().ToString()); - } + // shrink level base so L2 will hit soft limit easier. 
+ ASSERT_OK(dbfull()->SetOptions({ + {"max_bytes_for_level_base", "5000"}, + })); - ASSERT_EQ(results.size(), true_data.size()); - auto data_iter = true_data.begin(); - for (size_t i = 0; i < results.size(); i++, data_iter++) { - auto& kv = results[i]; - ASSERT_EQ(kv.first, data_iter->first); - ASSERT_EQ(kv.second, data_iter->second); - } + Put("", ""); + Flush(); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - delete iter; + sleeping_task_low.WaitUntilSleeping(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } -#endif -TEST_F(DBTest, PinnedDataIteratorMergeOperator) { +TEST_F(DBTest, LastWriteBufferDelay) { Options options = CurrentOptions(); - BlockBasedTableOptions table_options; - table_options.use_delta_encoding = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.merge_operator = MergeOperators::CreateUInt64AddOperator(); - DestroyAndReopen(options); - - std::string numbers[7]; - for (int val = 0; val <= 6; val++) { - PutFixed64(numbers + val, val); - } - - // +1 all keys in range [ 0 => 999] - for (int i = 0; i < 1000; i++) { - WriteOptions wo; - ASSERT_OK(db_->Merge(wo, Key(i), numbers[1])); - } + options.env = env_; + options.write_buffer_size = 100000; + options.max_write_buffer_number = 4; + options.delayed_write_rate = 20000; + options.compression = kNoCompression; + options.disable_auto_compactions = true; + int kNumKeysPerMemtable = 3; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); - // +2 all keys divisible by 2 in range [ 0 => 999] - for (int i = 0; i < 1000; i += 2) { - WriteOptions wo; - ASSERT_OK(db_->Merge(wo, Key(i), numbers[2])); - } + Reopen(options); + test::SleepingBackgroundTask sleeping_task; + // Block flushes + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + sleeping_task.WaitUntilSleeping(); - // +3 all keys 
divisible by 5 in range [ 0 => 999] - for (int i = 0; i < 1000; i += 5) { - WriteOptions wo; - ASSERT_OK(db_->Merge(wo, Key(i), numbers[3])); + // Create 3 L0 files, making score of L0 to be 3. + for (int i = 0; i < 3; i++) { + // Fill one mem table + for (int j = 0; j < kNumKeysPerMemtable; j++) { + Put(Key(j), ""); + } + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); } + // Inserting a new entry would create a new mem table, triggering slow down. + Put(Key(0), ""); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - ReadOptions ro; - ro.pin_data = true; - auto iter = db_->NewIterator(ro); + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); +} +#endif // ROCKSDB_LITE - std::vector> results; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - std::string prop_value; - ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("1", prop_value); - results.emplace_back(iter->key(), iter->value().ToString()); - } - - ASSERT_EQ(results.size(), 1000); - for (size_t i = 0; i < results.size(); i++) { - auto& kv = results[i]; - ASSERT_EQ(kv.first, Key(static_cast(i))); - int expected_val = 1; - if (i % 2 == 0) { - expected_val += 2; - } - if (i % 5 == 0) { - expected_val += 3; +TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { + CompressionType compressions[] = {kZlibCompression, kBZip2Compression, + kLZ4Compression, kLZ4HCCompression}; + for (int iter = 0; iter < 4; ++iter) { + if (!CompressionTypeSupported(compressions[iter])) { + // not supported, we should fail the Open() + Options options = CurrentOptions(); + options.compression = compressions[iter]; + ASSERT_TRUE(!TryReopen(options).ok()); + // Try if CreateColumnFamily also fails + options.compression = kNoCompression; + ASSERT_OK(TryReopen(options)); + ColumnFamilyOptions cf_options(options); + cf_options.compression = compressions[iter]; + ColumnFamilyHandle* handle; + ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", 
&handle).ok()); } - ASSERT_EQ(kv.second, numbers[expected_val]); } - - delete iter; } -TEST_F(DBTest, PinnedDataIteratorReadAfterUpdate) { +#ifndef ROCKSDB_LITE +TEST_F(DBTest, RowCache) { Options options = CurrentOptions(); - BlockBasedTableOptions table_options; - table_options.use_delta_encoding = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.write_buffer_size = 100000; + options.statistics = rocksdb::CreateDBStatistics(); + options.row_cache = NewLRUCache(8192); DestroyAndReopen(options); - Random rnd(301); - - std::map true_data; - for (int i = 0; i < 1000; i++) { - std::string k = RandomString(&rnd, 10); - std::string v = RandomString(&rnd, 1000); - ASSERT_OK(Put(k, v)); - true_data[k] = v; - } + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); - ReadOptions ro; - ro.pin_data = true; - auto iter = db_->NewIterator(ro); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0); + ASSERT_EQ(Get("foo"), "bar"); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); + ASSERT_EQ(Get("foo"), "bar"); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); +} +#endif // ROCKSDB_LITE - // Delete 50% of the keys and update the other 50% - for (auto& kv : true_data) { - if (rnd.OneIn(2)) { - ASSERT_OK(Delete(kv.first)); - } else { - std::string new_val = RandomString(&rnd, 1000); - ASSERT_OK(Put(kv.first, new_val)); - } - } +TEST_F(DBTest, DeletingOldWalAfterDrop) { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"}, + {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}}); + rocksdb::SyncPoint::GetInstance()->ClearTrace(); - std::vector> results; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - std::string prop_value; - 
ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); - ASSERT_EQ("1", prop_value); - results.emplace_back(iter->key(), iter->value().ToString()); - } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Options options = CurrentOptions(); + options.max_total_wal_size = 8192; + options.compression = kNoCompression; + options.write_buffer_size = 1 << 20; + options.level0_file_num_compaction_trigger = (1 << 30); + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - auto data_iter = true_data.begin(); - for (size_t i = 0; i < results.size(); i++, data_iter++) { - auto& kv = results[i]; - ASSERT_EQ(kv.first, data_iter->first); - ASSERT_EQ(kv.second, data_iter->second); - } + CreateColumnFamilies({"cf1", "cf2"}, options); + ASSERT_OK(Put(0, "key1", DummyString(8192))); + ASSERT_OK(Put(0, "key2", DummyString(8192))); + // the oldest wal should now be getting_flushed + ASSERT_OK(db_->DropColumnFamily(handles_[0])); + // all flushes should now do nothing because their CF is dropped + TEST_SYNC_POINT("Test:AllowFlushes"); + TEST_SYNC_POINT("Test:WaitForFlush"); + uint64_t lognum1 = dbfull()->TEST_LogfileNumber(); + ASSERT_OK(Put(1, "key3", DummyString(8192))); + ASSERT_OK(Put(1, "key4", DummyString(8192))); + // new wal should have been created + uint64_t lognum2 = dbfull()->TEST_LogfileNumber(); + EXPECT_GT(lognum2, lognum1); +} - delete iter; +TEST_F(DBTest, UnsupportedManualSync) { + DestroyAndReopen(CurrentOptions()); + env_->is_wal_sync_thread_safe_.store(false); + Status s = db_->SyncWAL(); + ASSERT_TRUE(s.IsNotSupported()); } INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam, @@ -10138,326 +5919,8 @@ TEST_F(DBTest, PauseBackgroundWorkTest) { ASSERT_EQ(true, done.load()); } -class SliceTransformLimitedDomain : public SliceTransform { - const char* Name() 
const override { return "SliceTransformLimitedDomain"; } - - Slice Transform(const Slice& src) const override { - return Slice(src.data(), 5); - } - - bool InDomain(const Slice& src) const override { - // prefix will be x???? - return src.size() >= 5 && src[0] == 'x'; - } - - bool InRange(const Slice& dst) const override { - // prefix will be x???? - return dst.size() == 5 && dst[0] == 'x'; - } -}; - -TEST_F(DBTest, PrefixExtractorFullFilter) { - BlockBasedTableOptions bbto; - // Full Filter Block - bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); - bbto.whole_key_filtering = false; - - Options options = CurrentOptions(); - options.prefix_extractor = std::make_shared(); - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - - DestroyAndReopen(options); - - ASSERT_OK(Put("x1111_AAAA", "val1")); - ASSERT_OK(Put("x1112_AAAA", "val2")); - ASSERT_OK(Put("x1113_AAAA", "val3")); - ASSERT_OK(Put("x1114_AAAA", "val4")); - // Not in domain, wont be added to filter - ASSERT_OK(Put("zzzzz_AAAA", "val5")); - - ASSERT_OK(Flush()); - - ASSERT_EQ(Get("x1111_AAAA"), "val1"); - ASSERT_EQ(Get("x1112_AAAA"), "val2"); - ASSERT_EQ(Get("x1113_AAAA"), "val3"); - ASSERT_EQ(Get("x1114_AAAA"), "val4"); - // Was not added to filter but rocksdb will try to read it from the filter - ASSERT_EQ(Get("zzzzz_AAAA"), "val5"); -} - -TEST_F(DBTest, PrefixExtractorBlockFilter) { - BlockBasedTableOptions bbto; - // Block Filter Block - bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); - - Options options = CurrentOptions(); - options.prefix_extractor = std::make_shared(); - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - - DestroyAndReopen(options); - - ASSERT_OK(Put("x1113_AAAA", "val3")); - ASSERT_OK(Put("x1114_AAAA", "val4")); - // Not in domain, wont be added to filter - ASSERT_OK(Put("zzzzz_AAAA", "val1")); - ASSERT_OK(Put("zzzzz_AAAB", "val2")); - ASSERT_OK(Put("zzzzz_AAAC", "val3")); - ASSERT_OK(Put("zzzzz_AAAD", "val4")); - - 
ASSERT_OK(Flush()); - - std::vector iter_res; - auto iter = db_->NewIterator(ReadOptions()); - // Seek to a key that was not in Domain - for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) { - iter_res.emplace_back(iter->value().ToString()); - } - - std::vector expected_res = {"val1", "val2", "val3", "val4"}; - ASSERT_EQ(iter_res, expected_res); - delete iter; -} - -TEST_F(DBTest, IteratorWithLocalStatistics) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - DestroyAndReopen(options); - - Random rnd(301); - for (int i = 0; i < 1000; i++) { - // Key 10 bytes / Value 10 bytes - ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); - } - - std::atomic total_next(0); - std::atomic total_next_found(0); - std::atomic total_prev(0); - std::atomic total_prev_found(0); - std::atomic total_bytes(0); - - std::vector threads; - std::function reader_func_next = [&]() { - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - // Seek will bump ITER_BYTES_READ - total_bytes += iter->key().size(); - total_bytes += iter->value().size(); - while (true) { - iter->Next(); - total_next++; - - if (!iter->Valid()) { - break; - } - total_next_found++; - total_bytes += iter->key().size(); - total_bytes += iter->value().size(); - } - - delete iter; - }; - - std::function reader_func_prev = [&]() { - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToLast(); - // Seek will bump ITER_BYTES_READ - total_bytes += iter->key().size(); - total_bytes += iter->value().size(); - while (true) { - iter->Prev(); - total_prev++; - - if (!iter->Valid()) { - break; - } - total_prev_found++; - total_bytes += iter->key().size(); - total_bytes += iter->value().size(); - } - - delete iter; - }; - - for (int i = 0; i < 10; i++) { - threads.emplace_back(reader_func_next); - } - for (int i = 0; i < 15; i++) { - threads.emplace_back(reader_func_prev); - } - - for (auto& t : threads) { - t.join(); - } - - 
ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), total_next); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND), - total_next_found); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), total_prev); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND), - total_prev_found); - ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ), total_bytes); -} - -#ifndef ROCKSDB_LITE -class BloomStatsTestWithParam - : public DBTest, - public testing::WithParamInterface> { - public: - BloomStatsTestWithParam() { - use_block_table_ = std::get<0>(GetParam()); - use_block_based_builder_ = std::get<1>(GetParam()); - - options_.create_if_missing = true; - options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4)); - options_.memtable_prefix_bloom_bits = 8 * 1024; - if (use_block_table_) { - BlockBasedTableOptions table_options; - table_options.hash_index_allow_collision = false; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder_)); - options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); - } else { - PlainTableOptions table_options; - options_.table_factory.reset(NewPlainTableFactory(table_options)); - } - - perf_context.Reset(); - DestroyAndReopen(options_); - } - - ~BloomStatsTestWithParam() { - perf_context.Reset(); - Destroy(options_); - } - - // Required if inheriting from testing::WithParamInterface<> - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - bool use_block_table_; - bool use_block_based_builder_; - Options options_; -}; - -// 1 Insert 2 K-V pairs into DB -// 2 Call Get() for both keys - expext memtable bloom hit stat to be 2 -// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1 -// 4 Call Flush() to create SST -// 5 Call Get() for both keys - expext SST bloom hit stat to be 2 -// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1 -// Test both: block and plain SST -TEST_P(BloomStatsTestWithParam, 
BloomStatsTest) { - std::string key1("AAAA"); - std::string key2("RXDB"); // not in DB - std::string key3("ZBRA"); - std::string value1("Value1"); - std::string value3("Value3"); - - ASSERT_OK(Put(key1, value1, WriteOptions())); - ASSERT_OK(Put(key3, value3, WriteOptions())); - - // check memtable bloom stats - ASSERT_EQ(value1, Get(key1)); - ASSERT_EQ(1, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(value3, Get(key3)); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); - - ASSERT_EQ("NOT_FOUND", Get(key2)); - ASSERT_EQ(1, perf_context.bloom_memtable_miss_count); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - - // sanity checks - ASSERT_EQ(0, perf_context.bloom_sst_hit_count); - ASSERT_EQ(0, perf_context.bloom_sst_miss_count); - - Flush(); - - // sanity checks - ASSERT_EQ(0, perf_context.bloom_sst_hit_count); - ASSERT_EQ(0, perf_context.bloom_sst_miss_count); - - // check SST bloom stats - // NOTE: hits per get differs because of code paths differences - // in BlockBasedTable::Get() - int hits_per_get = use_block_table_ && !use_block_based_builder_ ? 
2 : 1; - ASSERT_EQ(value1, Get(key1)); - ASSERT_EQ(hits_per_get, perf_context.bloom_sst_hit_count); - ASSERT_EQ(value3, Get(key3)); - ASSERT_EQ(2 * hits_per_get, perf_context.bloom_sst_hit_count); - - ASSERT_EQ("NOT_FOUND", Get(key2)); - ASSERT_EQ(1, perf_context.bloom_sst_miss_count); -} - -// Same scenario as in BloomStatsTest but using an iterator -TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { - std::string key1("AAAA"); - std::string key2("RXDB"); // not in DB - std::string key3("ZBRA"); - std::string value1("Value1"); - std::string value3("Value3"); - - ASSERT_OK(Put(key1, value1, WriteOptions())); - ASSERT_OK(Put(key3, value3, WriteOptions())); - - unique_ptr iter(dbfull()->NewIterator(ReadOptions())); - - // check memtable bloom stats - iter->Seek(key1); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value1, iter->value().ToString()); - ASSERT_EQ(1, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); - - iter->Seek(key3); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value3, iter->value().ToString()); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); - - iter->Seek(key2); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(1, perf_context.bloom_memtable_miss_count); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - - Flush(); - - iter.reset(dbfull()->NewIterator(ReadOptions())); - - // Check SST bloom stats - iter->Seek(key1); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value1, iter->value().ToString()); - ASSERT_EQ(1, perf_context.bloom_sst_hit_count); - - iter->Seek(key3); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value3, iter->value().ToString()); - ASSERT_EQ(2, perf_context.bloom_sst_hit_count); - - iter->Seek(key2); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(1, 
perf_context.bloom_sst_miss_count); - ASSERT_EQ(2, perf_context.bloom_sst_hit_count); -} - -INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, - ::testing::Values(std::make_tuple(true, true), - std::make_tuple(true, false), - std::make_tuple(false, false))); -#endif // ROCKSDB_LITE } // namespace rocksdb - int main(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff --git a/db/db_test2.cc b/db/db_test2.cc index 7c02a6ffe..32487b1c4 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -13,11 +13,6 @@ namespace rocksdb { -static uint64_t TestGetTickerCount(const Options& options, - Tickers ticker_type) { - return options.statistics->getTickerCount(ticker_type); -} - class DBTest2 : public DBTestBase { public: DBTest2() : DBTestBase("/db_test2") {} diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 1af09e286..7e2adaa78 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -1043,4 +1043,34 @@ std::unordered_map DBTestBase::GetAllSSTFiles( return res; } +std::vector DBTestBase::ListTableFiles(Env* env, + const std::string& path) { + std::vector files; + std::vector file_numbers; + env->GetChildren(path, &files); + uint64_t number; + FileType type; + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == kTableFile) { + file_numbers.push_back(number); + } + } + } + return file_numbers; +} + +#ifndef ROCKSDB_LITE +uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily( + DB* db, std::string column_family_name) { + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + uint64_t result = 0; + for (auto& fileMetadata : metadata) { + result += (fileMetadata.column_family_name == column_family_name); + } + return result; +} +#endif // ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/db_test_util.h b/db/db_test_util.h index ca2b466e9..adda26230 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -39,6 +39,7 @@ 
#include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" #include "table/block_based_table_factory.h" @@ -491,6 +492,31 @@ class SpecialEnv : public EnvWrapper { std::atomic is_wal_sync_thread_safe_{true}; }; +class OnFileDeletionListener : public EventListener { + public: + OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {} + + void SetExpectedFileName(const std::string file_name) { + expected_file_name_ = file_name; + } + + void VerifyMatchedCount(size_t expected_value) { + ASSERT_EQ(matched_count_, expected_value); + } + + void OnTableFileDeleted(const TableFileDeletionInfo& info) override { + if (expected_file_name_ != "") { + ASSERT_EQ(expected_file_name_, info.file_path); + expected_file_name_ = ""; + matched_count_++; + } + } + + private: + size_t matched_count_; + std::string expected_file_name_; +}; + class DBTestBase : public testing::Test { protected: // Sequence of option configurations to try @@ -756,6 +782,17 @@ class DBTestBase : public testing::Test { std::unordered_map GetAllSSTFiles( uint64_t* total_size = nullptr); + + std::vector ListTableFiles(Env* env, const std::string& path); + +#ifndef ROCKSDB_LITE + uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string column_family_name); +#endif // ROCKSDB_LITE + + uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); + } }; } // namespace rocksdb diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index ded8ed9b4..868039a5c 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -14,11 +14,6 @@ namespace rocksdb { -static uint64_t TestGetTickerCount(const Options& options, - Tickers ticker_type) { - return options.statistics->getTickerCount(ticker_type); -} - static std::string 
CompressibleString(Random* rnd, int len) { std::string r; test::CompressibleString(rnd, 0.8, len, &r); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 14b9e2ffd..bb944735d 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -127,6 +127,634 @@ TEST_F(DBWALTest, SyncWALNotWaitWrite) { ASSERT_EQ(Get("foo2"), "bar2"); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } + +TEST_F(DBWALTest, Recover) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + } while (ChangeOptions()); +} + +TEST_F(DBWALTest, RecoverWithTableHandle) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Put(1, "bar", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "big", std::string(100, 'a'))); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + + std::vector> files; + dbfull()->TEST_GetFilesMetaData(handles_[1], &files); + size_t total_files = 0; + for (const auto& level : files) { + total_files += level.size(); + } + ASSERT_EQ(total_files, 3); + for (const auto& level : files) { + for (const auto& file : level) { + if (kInfiniteMaxOpenFiles == option_config_) { + ASSERT_TRUE(file.table_reader_handle != nullptr); 
+ } else { + ASSERT_TRUE(file.table_reader_handle == nullptr); + } + } + } + } while (ChangeOptions()); +} + +TEST_F(DBWALTest, IgnoreRecoveredLog) { + std::string backup_logs = dbname_ + "/backup_logs"; + + // delete old files in backup_logs directory + env_->CreateDirIfMissing(backup_logs); + std::vector old_files; + env_->GetChildren(backup_logs, &old_files); + for (auto& file : old_files) { + if (file != "." && file != "..") { + env_->DeleteFile(backup_logs + "/" + file); + } + } + + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.wal_dir = dbname_ + "/logs"; + DestroyAndReopen(options); + + // fill up the DB + std::string one, two; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one))); + + // copy the logs to backup + std::vector logs; + env_->GetChildren(options.wal_dir, &logs); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); + } + } + + // recover the DB + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + + // copy the logs from backup back to wal dir + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // this should ignore the log files, recovery should not happen again + // if the recovery happens, the same merge operator would be called twice, + // leading to incorrect results + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + Destroy(options); + Reopen(options); + Close(); + + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." 
&& log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // assert that we successfully recovered only from logs, even though we + // destroyed the DB + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + + // Recovery will fail if DB directory doesn't exist. + Destroy(options); + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + env_->DeleteFile(backup_logs + "/" + log); + } + } + Status s = TryReopen(options); + ASSERT_TRUE(!s.ok()); + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST_F(DBWALTest, RecoveryWithEmptyLog) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "foo", "v2")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v3", Get(1, "foo")); + } while (ChangeOptions()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBWALTest, RecoverWithLargeLog) { + do { + { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "big1", std::string(200000, '1'))); + ASSERT_OK(Put(1, "big2", std::string(200000, '2'))); + ASSERT_OK(Put(1, "small3", std::string(10, '3'))); + ASSERT_OK(Put(1, "small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. 
+ Options options; + options.write_buffer_size = 100000; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); + ASSERT_EQ(std::string(200000, '1'), Get(1, "big1")); + ASSERT_EQ(std::string(200000, '2'), Get(1, "big2")); + ASSERT_EQ(std::string(10, '3'), Get(1, "small3")); + ASSERT_EQ(std::string(10, '4'), Get(1, "small4")); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 1); + } while (ChangeCompactOptions()); +} + +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it was empty. Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) { + Options options = CurrentOptions(); + options.write_buffer_size = 5000000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + // Since we will reopen DB with smaller write_buffer_size, + // each key will go to new SST file + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + + ASSERT_OK(Put(3, Key(10), DummyString(1))); + // Make 'dobrynia' to be flushed and new WAL file to be created + ASSERT_OK(Put(2, Key(10), DummyString(7500000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + // Make sure 'dobrynia' was flushed: check sst files amount + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + } + // New WAL file + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), 
DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + + options.write_buffer_size = 4096; + options.arena_block_size = 4096; + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + // No inserts => default is empty + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(5)); + // 1 SST for big key + 1 SST for small one + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + // 1 SST for all keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } +} + +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it wasn't empty. Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST_F(DBWALTest, RecoverCheckFileAmount) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000; + options.arena_block_size = 4 * 1024; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Make 'nikitich' memtable to be flushed + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // 4 memtable are not flushed, 1 sst file + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + // Memtable for 'nikitich' has flushed, new WAL file has opened + // 4 memtable still not flushed + + // Write to new WAL file + ASSERT_OK(Put(0, Key(1), DummyString(1))); 
+ ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Fill up 'nikitich' one more time + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + // make it flush + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // There are still 4 memtable not flushed, and 2 sst tables + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + std::vector table_files = ListTableFiles(env_, dbname_); + // Check, that records for 'default', 'dobrynia' and 'pikachu' from + // first, second and third WALs went to the same SST. + // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for + // 'dobrynia', one for 'pikachu' + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + } +} + +TEST_F(DBWALTest, SyncMultipleLogs) { + const uint64_t kNumBatches = 2; + const int kBatchSize = 1000; + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.write_buffer_size = 4096; + Reopen(options); + + WriteBatch batch; + WriteOptions wo; + wo.sync = true; + + for (uint64_t b = 0; b < kNumBatches; b++) { + batch.Clear(); + for (int i = 0; i < kBatchSize; i++) { + batch.Put(Key(i), DummyString(128)); + } + + dbfull()->Write(wo, &batch); + } + + ASSERT_OK(dbfull()->SyncWAL()); +} + +// +// Test WAL recovery for the various modes available +// 
+class RecoveryTestHelper { + public: + // Number of WAL files to generate + static const int kWALFilesCount = 10; + // Starting number for the WAL file name like 00010.log + static const int kWALFileOffset = 10; + // Keys to be written per WAL file + static const int kKeysPerWALFile = 1024; + // Size of the value + static const int kValueSize = 10; + + // Create WAL files with values filled in + static void FillData(DBWALTest* test, const Options& options, + const size_t wal_count, size_t* count) { + const DBOptions& db_options = options; + + *count = 0; + + shared_ptr table_cache = NewLRUCache(50000, 16); + EnvOptions env_options; + WriteBuffer write_buffer(db_options.db_write_buffer_size); + + unique_ptr versions; + unique_ptr wal_manager; + WriteController write_controller; + + versions.reset(new VersionSet(test->dbname_, &db_options, env_options, + table_cache.get(), &write_buffer, + &write_controller)); + + wal_manager.reset(new WalManager(db_options, env_options)); + + std::unique_ptr current_log_writer; + + for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { + uint64_t current_log_number = j; + std::string fname = LogFileName(test->dbname_, current_log_number); + unique_ptr file; + ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + current_log_writer.reset( + new log::Writer(std::move(file_writer), current_log_number, + db_options.recycle_log_file_num > 0)); + + for (int i = 0; i < kKeysPerWALFile; i++) { + std::string key = "key" + ToString((*count)++); + std::string value = test->DummyString(kValueSize); + assert(current_log_writer.get() != nullptr); + uint64_t seq = versions->LastSequence() + 1; + WriteBatch batch; + batch.Put(key, value); + WriteBatchInternal::SetSequence(&batch, seq); + current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); + versions->SetLastSequence(seq); + } + } + } + + // Recreate and fill the 
store with some data + static size_t FillData(DBWALTest* test, Options* options) { + options->create_if_missing = true; + test->DestroyAndReopen(*options); + test->Close(); + + size_t count = 0; + FillData(test, *options, kWALFilesCount, &count); + return count; + } + + // Read back all the keys we wrote and return the number of keys found + static size_t GetData(DBWALTest* test) { + size_t count = 0; + for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) { + if (test->Get("key" + ToString(i)) != "NOT_FOUND") { + ++count; + } + } + return count; + } + + // Manuall corrupt the specified WAL + static void CorruptWAL(DBWALTest* test, const Options& options, + const double off, const double len, + const int wal_file_id, const bool trunc = false) { + Env* env = options.env; + std::string fname = LogFileName(test->dbname_, wal_file_id); + uint64_t size; + ASSERT_OK(env->GetFileSize(fname, &size)); + ASSERT_GT(size, 0); +#ifdef OS_WIN + // Windows disk cache behaves differently. When we truncate + // the original content is still in the cache due to the original + // handle is still open. Generally, in Windows, one prohibits + // shared access to files and it is not needed for WAL but we allow + // it to induce corruption at various tests. 
+    test->Close();
+#endif
+    if (trunc) {
+      ASSERT_EQ(0, truncate(fname.c_str(), static_cast(size * off)));
+    } else {
+      InduceCorruption(fname, static_cast(size * off),
+                       static_cast(size * len));
+    }
+  }
+
+  // Overwrite `len` bytes of `filename`, starting at byte `offset`, with the
+  // character 'a'. `len` must be greater than zero.
+  static void InduceCorruption(const std::string& filename, size_t offset,
+                               size_t len) {
+    ASSERT_GT(len, 0U);
+
+    int fd = open(filename.c_str(), O_RDWR);
+
+    // open() reports failure with -1; descriptor 0 is a valid result.
+    ASSERT_GE(fd, 0);
+
+    // A failing ASSERT_* returns from this function immediately, so the
+    // descriptor is released by a scope guard on every exit path.
+    struct FdCloser {
+      int fd_;
+      ~FdCloser() { close(fd_); }
+    } fd_closer{fd};
+
+    ASSERT_EQ(offset, lseek(fd, offset, SEEK_SET));
+
+    // Heap-backed fill buffer, so a large `len` cannot overflow the stack.
+    const std::string buf(len, 'a');
+    // Narrow explicitly: the count parameter of write() is an unsigned int
+    // on Windows.
+    ASSERT_EQ(len, write(fd, buf.data(), static_cast<unsigned int>(len)));
+  }
+};
+
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing
+//   writes at the end of any of the logs
+// - We do not expect to open the data store for corruption
+//
+// Every combination of corruption style (truncate / overwrite), offset
+// (i * .3 of the file size, i.e. 0%, 30%, 60%, 90%) and WAL file is tried.
+TEST_F(DBWALTest, kTolerateCorruptedTailRecords) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+  for (auto trunc : {true, false}) {        /* Corruption style */
+    for (int i = 0; i < 4; i++) {           /* Corruption offset position */
+      for (int j = jstart; j < jend; j++) { /* WAL file */
+        // Fill data for testing
+        Options options = CurrentOptions();
+        const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+        // test checksum failure or parsing
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, /*wal=*/j, trunc);
+
+        if (trunc) {
+          // Truncation looks like an incomplete trailing write: the store
+          // must open, with fewer rows than were written.
+          options.wal_recovery_mode =
+              WALRecoveryMode::kTolerateCorruptedTailRecords;
+          options.create_if_missing = false;
+          ASSERT_OK(TryReopen(options));
+          const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+          // Only a truncation at offset 0 (i == 0) is allowed to recover
+          // no rows at all.
+          ASSERT_TRUE(i == 0 || recovered_row_count > 0);
+          ASSERT_LT(recovered_row_count, row_count);
+        } else {
+          // Overwritten bytes are real corruption: the open must fail.
+          options.wal_recovery_mode =
+              WALRecoveryMode::kTolerateCorruptedTailRecords;
+          ASSERT_NOK(TryReopen(options));
+        }
+      }
+    }
+  }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle or trailing -- incomplete writes or corruption)
+TEST_F(DBWALTest, kAbsoluteConsistency) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+  // Verify clean slate behavior
+  Options options = CurrentOptions();
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+  for (auto trunc : {true, false}) { /* Corruption style */
+    for (int i = 0; i < 4; i++) {    /* Corruption offset position */
+      // Truncating at offset 0 leaves a zero-length file; that case is
+      // skipped (presumably an empty WAL is not reported as corruption --
+      // TODO confirm).
+      if (trunc && i == 0) {
+        continue;
+      }
+
+      for (int j = jstart; j < jend; j++) { /* wal files */
+        // fill with new data
+        RecoveryTestHelper::FillData(this, &options);
+        // corrupt the wal
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, j, trunc);
+        // verify
+        options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+        options.create_if_missing = false;
+        ASSERT_NOK(TryReopen(options));
+      }
+    }
+  }
+}
+
+// Test scope:
+// - We expect to open data store under all circumstances
+// - We expect only data up to the point where the first error was encountered
+TEST_F(DBWALTest, kPointInTimeRecovery) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+  const int maxkeys =
+      RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
+
+  for (auto trunc : {true, false}) {        /* Corruption style */
+    for (int i = 0; i < 4; i++) {           /* Offset of corruption */
+      for (int j = jstart; j < jend; j++) { /* WAL file */
+        // Fill data for testing
+        Options options = CurrentOptions();
+        const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+        // Corrupt the wal
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, j, trunc);
+
+        // Verify
+        options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+        options.create_if_missing = false;
+        ASSERT_OK(TryReopen(options));
+
+        // Probe data for invariants
+        size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+        ASSERT_LT(recovered_row_count, row_count);
+
+        // The recovered keys must form a prefix of "key0", "key1", ...:
+        // once one key is missing, no later key may be present.
+        bool expect_data = true;
+        for (size_t k = 0; k < maxkeys; ++k) {
+          bool found = Get("key" + ToString(k)) != "NOT_FOUND";
+          if (expect_data && !found) {
+            expect_data = false;
+          }
+          ASSERT_EQ(found, expect_data);
+        }
+
+        // Lower bound: kKeysPerWALFile rows for each WAL file that precedes
+        // the corrupted one.
+        const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+                           (j - RecoveryTestHelper::kWALFileOffset);
+        ASSERT_GE(recovered_row_count, min);
+        // Upper bound (checked only for in-place corruption past offset 0):
+        // nothing beyond the end of the corrupted file is recovered.
+        if (!trunc && i != 0) {
+          const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+                             (j - RecoveryTestHelper::kWALFileOffset + 1);
+          ASSERT_LE(recovered_row_count, max);
+        }
+      }
+    }
+  }
+}
+
+// Test scope:
+// - We expect to open the data store under all scenarios
+// - We expect to have recovered records past the corruption zone
+TEST_F(DBWALTest, kSkipAnyCorruptedRecords) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+  for (auto trunc : {true, false}) {        /* Corruption style */
+    for (int i = 0; i < 4; i++) {           /* Corruption offset */
+      for (int j = jstart; j < jend; j++) { /* wal files */
+        // Fill data for testing
+        Options options = CurrentOptions();
+        const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+        // Corrupt the WAL
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, j, trunc);
+
+        // Verify behavior
+        options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+        options.create_if_missing = false;
+        ASSERT_OK(TryReopen(options));
+
+        // Probe data for invariants
+        size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+        ASSERT_LT(recovered_row_count, row_count);
+
+        // For in-place corruption at a non-zero offset some rows must
+        // survive; corruption at offset 0 (i == 0) is exempt.
+        if (!trunc) {
+          ASSERT_TRUE(i != 0 || recovered_row_count > 0);
+        }
+      }
+    }
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/src.mk b/src.mk
index 9f3671305..c53627cef 100644
--- a/src.mk
+++ b/src.mk
@@ -198,13 +198,17 @@ TEST_BENCH_SOURCES = \
   db/dbformat_test.cc \
   db/db_iter_test.cc \
   db/db_test.cc \
+  db/db_block_cache_test.cc \
+  db/db_bloom_filter_test.cc \
   db/db_compaction_filter_test.cc \
   db/db_compaction_test.cc \
   db/db_dynamic_level_test.cc \
   db/db_inplace_update_test.cc \
+  db/db_iterator_test.cc \
   db/db_log_iter_test.cc \
-  db/db_universal_compaction_test.cc \
+  db/db_sst_test.cc \
   db/db_tailing_iter_test.cc \
+  db/db_universal_compaction_test.cc \
   db/db_wal_test.cc \
   db/db_table_properties_test.cc \
   db/deletefile_test.cc \