diff --git a/CMakeLists.txt b/CMakeLists.txt index 3655c79c2..98efe3892 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,6 +294,7 @@ set(TESTS db/db_universal_compaction_test.cc db/db_wal_test.cc db/db_tailing_iter_test.cc + db/db_table_properties_test.cc db/dbformat_test.cc db/deletefile_test.cc db/fault_injection_test.cc diff --git a/Makefile b/Makefile index 34152533d..89d8ab56c 100644 --- a/Makefile +++ b/Makefile @@ -230,6 +230,7 @@ TESTS = \ db_tailing_iter_test \ db_universal_compaction_test \ db_wal_test \ + db_table_properties_test \ block_hash_index_test \ autovector_test \ column_family_test \ @@ -741,6 +742,9 @@ db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util. db_wal_test: db/db_wal_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_table_properties_test: db/db_table_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) $(pg) diff --git a/db/db_impl.cc b/db/db_impl.cc index 179455f35..42df6ec66 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4161,6 +4161,29 @@ Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, return s; } + +Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, + const Range* range, int n, + TablePropertiesCollection* props) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfTablesInRange(range, n, props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + #endif // ROCKSDB_LITE const std::string& DBImpl::GetName() const { diff --git a/db/db_impl.h b/db/db_impl.h index a896fad26..3d8754634 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -810,6 +810,10 @@ class DBImpl : public DB { virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, int n, + TablePropertiesCollection* props) override; + #endif // ROCKSDB_LITE // Function that Get and KeyMayExist call with no_io true or false diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc new file mode 100644 index 000000000..aff61baa8 --- /dev/null +++ b/db/db_table_properties_test.cc @@ -0,0 +1,217 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "util/testharness.h" +#include "util/testutil.h" + +#ifndef ROCKSDB_LITE + +namespace rocksdb { + +// A helper function that ensures the table properties returned in +// `GetPropertiesOfAllTablesTest` is correct. +// This test assumes entries size is different for each of the tables. +namespace { + +void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { + TablePropertiesCollection props; + ASSERT_OK(db->GetPropertiesOfAllTables(&props)); + + ASSERT_EQ(4U, props.size()); + std::unordered_set unique_entries; + + // Indirect test + uint64_t sum = 0; + for (const auto& item : props) { + unique_entries.insert(item.second->num_entries); + sum += item.second->num_entries; + } + + ASSERT_EQ(props.size(), unique_entries.size()); + ASSERT_EQ(expected_entries_size, sum); +} +} // namespace + +class DBTablePropertiesTest : public DBTestBase { + public: + DBTablePropertiesTest() : DBTestBase("/db_table_properties_test") {} + TablePropertiesCollection TestGetPropertiesOfTablesInRange( + std::vector ranges, std::size_t* num_properties = nullptr, + std::size_t* num_files = nullptr); +}; + +TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 8; + Reopen(options); + // Create 4 tables + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + } + db_->Flush(FlushOptions()); + } + + // 1. Read table properties directly from file + Reopen(options); + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 2. Put two tables to table cache and + Reopen(options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. + for (int i = 0; i < 2; ++i) { + Get(ToString(i * 100 + 0)); + } + + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 3. Put all tables to table cache + Reopen(options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. + for (int i = 0; i < 4; ++i) { + Get(ToString(i * 100 + 0)); + } + VerifyTableProperties(db_, 10 + 11 + 12 + 13); +} + +TablePropertiesCollection +DBTablePropertiesTest::TestGetPropertiesOfTablesInRange( + std::vector ranges, std::size_t* num_properties, + std::size_t* num_files) { + // run the query + TablePropertiesCollection props; + EXPECT_OK(db_->GetPropertiesOfTablesInRange( + db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props)); + + // Make sure that we've received properties for those and for those files + // only which fall within requested ranges + std::vector vmd; + db_->GetLiveFilesMetaData(&vmd); + for (auto md : vmd) { + std::string fn = md.db_path + md.name; + bool in_range = false; + for (auto r : ranges) { + // smallestkey < limit && largestkey >= start + if (r.limit.compare(md.smallestkey) >= 0 && + r.start.compare(md.largestkey) <= 0) { + in_range = true; + EXPECT_GT(props.count(fn), 0); + } + } + if (!in_range) { + EXPECT_EQ(props.count(fn), 0); + } + } + + if (num_properties) { + *num_properties = props.size(); + } + + if (num_files) { + *num_files = vmd.size(); + } + return props; +} + +TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { + // Fixed random sead + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = 4096; + options.max_write_buffer_number = 8; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.target_file_size_base = 2048; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.soft_rate_limit = 1.1; + options.num_levels = 8; + + DestroyAndReopen(options); + + // build a decent LSM + for (int i = 0; i < 10000; i++) { + ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102))); + } + Flush(); + db_->PauseBackgroundWork(); + + // Ensure that we have at least L0, L1 and L2 + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ASSERT_GT(NumTableFilesAtLevel(2), 0); + + // Query the largest range + std::size_t num_properties, num_files; + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST), + test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))}, + &num_properties, &num_files); + ASSERT_EQ(num_properties, num_files); + + // Query the empty range + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST), + test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))}, + &num_properties, &num_files); + ASSERT_GT(num_files, 0); + ASSERT_EQ(num_properties, 0); + + // Query the middle rangee + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE), + test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))}, + &num_properties, &num_files); + ASSERT_GT(num_files, 0); + ASSERT_GT(num_files, num_properties); + ASSERT_GT(num_properties, 0); + + // Query a bunch of random ranges + for (int j = 0; j < 100; j++) { + // create a bunch of ranges + std::vector random_keys; + auto n = 2 * rnd.Uniform(50); + for (uint i = 0; i < n; ++i) { + random_keys.push_back(test::RandomKey(&rnd, 5)); + } + + std::vector ranges; + auto it = random_keys.begin(); + while (it != random_keys.end()) { + ranges.push_back(Range(*it, *(it + 1))); + it += 2; + } + + TestGetPropertiesOfTablesInRange(std::move(ranges)); + } +} +} // namespace rocksdb + +#endif // ROCKSDB_LITE + +int main(int argc, char** argv) { +#if !(defined NDEBUG) || !defined(OS_WIN) + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + return 0; +#endif +} diff --git a/db/db_test.cc b/db/db_test.cc index abc33322d..83db98c55 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -84,24 +84,6 @@ static long TestGetTickerCount(const Options& options, Tickers ticker_type) { // This test assumes entries size is different for each of the tables. namespace { -void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { - TablePropertiesCollection props; - ASSERT_OK(db->GetPropertiesOfAllTables(&props)); - - ASSERT_EQ(4U, props.size()); - std::unordered_set unique_entries; - - // Indirect test - uint64_t sum = 0; - for (const auto& item : props) { - unique_entries.insert(item.second->num_entries); - sum += item.second->num_entries; - } - - ASSERT_EQ(props.size(), unique_entries.size()); - ASSERT_EQ(expected_entries_size, sum); -} - uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, std::string column_family_name) { std::vector metadata; @@ -440,42 +422,6 @@ TEST_F(DBTest, ParanoidFileChecks) { TestGetTickerCount(options, BLOCK_CACHE_ADD)); } -TEST_F(DBTest, GetPropertiesOfAllTablesTest) { - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 8; - Reopen(options); - // Create 4 tables - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); - } - db_->Flush(FlushOptions()); - } - - // 1. Read table properties directly from file - Reopen(options); - VerifyTableProperties(db_, 10 + 11 + 12 + 13); - - // 2. Put two tables to table cache and - Reopen(options); - // fetch key from 1st and 2nd table, which will internally place that table to - // the table cache. - for (int i = 0; i < 2; ++i) { - Get(ToString(i * 100 + 0)); - } - - VerifyTableProperties(db_, 10 + 11 + 12 + 13); - - // 3. Put all tables to table cache - Reopen(options); - // fetch key from 1st and 2nd table, which will internally place that table to - // the table cache. - for (int i = 0; i < 4; ++i) { - Get(ToString(i * 100 + 0)); - } - VerifyTableProperties(db_, 10 + 11 + 12 + 13); -} - namespace { void ResetTableProperties(TableProperties* tp) { tp->data_size = 0; @@ -5692,6 +5638,12 @@ class ModelDB: public DB { TablePropertiesCollection* props) override { return Status(); } + + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, int n, + TablePropertiesCollection* props) override { + return Status(); + } #endif // ROCKSDB_LITE using DB::KeyMayExist; diff --git a/db/version_set.cc b/db/version_set.cc index 054a3442f..5ddecbb53 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -542,7 +542,7 @@ class BaseReferencedVersionBuilder { Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, - const std::string* fname) { + const std::string* fname) const { auto table_cache = cfd_->table_cache(); auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( @@ -624,6 +624,38 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, return Status::OK(); } +Status Version::GetPropertiesOfTablesInRange( + const Range* range, int n, TablePropertiesCollection* props) const { + for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + std::vector files; + storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr, + false); + for (const auto& file_meta : files) { + auto fname = + TableFileName(vset_->db_options_->db_paths, + file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); + if (props->count(fname) == 0) { + // 1. If the table is already present in table cache, load table + // properties from there. + std::shared_ptr table_properties; + Status s = GetTableProperties(&table_properties, file_meta, &fname); + if (s.ok()) { + props->insert({fname, table_properties}); + } else { + return s; + } + } + } + } + } + + return Status::OK(); +} + Status Version::GetAggregatedTableProperties( std::shared_ptr* tp, int level) { TablePropertiesCollection props; @@ -1406,7 +1438,8 @@ bool VersionStorageInfo::OverlapInLevel(int level, // The file_index returns a pointer to any file in an overlapping range. void VersionStorageInfo::GetOverlappingInputs( int level, const InternalKey* begin, const InternalKey* end, - std::vector* inputs, int hint_index, int* file_index) { + std::vector* inputs, int hint_index, int* file_index, + bool expand_range) const { if (level >= num_non_empty_levels_) { // this level is empty, no overlapping inputs return; @@ -1439,7 +1472,7 @@ void VersionStorageInfo::GetOverlappingInputs( // "f" is completely after specified range; skip it } else { inputs->push_back(files_[level][i-1]); - if (level == 0) { + if (level == 0 && expand_range) { // Level-0 files may overlap each other. So check if the newly // added file has expanded the range. If so, restart search. if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { @@ -1465,7 +1498,7 @@ void VersionStorageInfo::GetOverlappingInputs( // forwards to find all overlapping files. void VersionStorageInfo::GetOverlappingInputsBinarySearch( int level, const Slice& user_begin, const Slice& user_end, - std::vector* inputs, int hint_index, int* file_index) { + std::vector* inputs, int hint_index, int* file_index) const { assert(level > 0); int min = 0; int mid = 0; @@ -1513,8 +1546,7 @@ void VersionStorageInfo::GetOverlappingInputsBinarySearch( // Use FileLevel in searching, make it faster void VersionStorageInfo::ExtendOverlappingInputs( int level, const Slice& user_begin, const Slice& user_end, - std::vector* inputs, unsigned int midIndex) { - + std::vector* inputs, unsigned int midIndex) const { const Comparator* user_cmp = user_comparator_; const FdWithKeyRange* files = level_files_brief_[level].files; #ifndef NDEBUG diff --git a/db/version_set.h b/db/version_set.h index a9ec14b94..1f4345575 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -159,23 +159,26 @@ class VersionStorageInfo { int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys std::vector* inputs, - int hint_index = -1, // index of overlap file - int* file_index = nullptr); // return index of overlap file + int hint_index = -1, // index of overlap file + int* file_index = nullptr, // return index of overlap file + bool expand_range = true) // if set, returns files which overlap the + const; // range and overlap each other. If false, + // then just files intersecting the range void GetOverlappingInputsBinarySearch( int level, const Slice& begin, // nullptr means before all keys const Slice& end, // nullptr means after all keys std::vector* inputs, - int hint_index, // index of overlap file - int* file_index); // return index of overlap file + int hint_index, // index of overlap file + int* file_index) const; // return index of overlap file void ExtendOverlappingInputs( int level, const Slice& begin, // nullptr means before all keys const Slice& end, // nullptr means after all keys std::vector* inputs, - unsigned int index); // start extending from this index + unsigned int index) const; // start extending from this index // Returns true iff some file in the specified level overlaps // some part of [*smallest_user_key,*largest_user_key]. @@ -456,15 +459,16 @@ class Version { // file-name conversion. Status GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, - const std::string* fname = nullptr); + const std::string* fname = nullptr) const; // REQUIRES: lock is held // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the // tables' propertis, represented as shared_ptr. Status GetPropertiesOfAllTables(TablePropertiesCollection* props); - Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); + Status GetPropertiesOfTablesInRange(const Range* range, int n, + TablePropertiesCollection* props) const; // REQUIRES: lock is held // On success, "tp" will contains the aggregated table property amoug diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0ded47eb4..c32957d26 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -716,6 +716,9 @@ class DB { virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { return GetPropertiesOfAllTables(DefaultColumnFamily(), props); } + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, int n, + TablePropertiesCollection* props) = 0; #endif // ROCKSDB_LITE // Needed for StackableDB diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index aef192b07..ed67e0436 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -279,6 +279,13 @@ class StackableDB : public DB { return db_->GetPropertiesOfAllTables(column_family, props); } + using DB::GetPropertiesOfTablesInRange; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, int n, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfTablesInRange(column_family, range, n, props); + } + virtual Status GetUpdatesSince( SequenceNumber seq_number, unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options) override { diff --git a/src.mk b/src.mk index 7b3f15b19..cb43744be 100644 --- a/src.mk +++ b/src.mk @@ -190,6 +190,7 @@ TEST_BENCH_SOURCES = \ db/db_universal_compaction_test.cc \ db/db_tailing_iter_test.cc \ db/db_wal_test.cc \ + db/db_table_properties_test.cc \ db/deletefile_test.cc \ db/fault_injection_test.cc \ db/file_indexer_test.cc \ diff --git a/util/testutil.cc b/util/testutil.cc index 5f7422172..b995a2e53 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -33,7 +33,7 @@ extern std::string RandomHumanReadableString(Random* rnd, int len) { return ret; } -std::string RandomKey(Random* rnd, int len) { +std::string RandomKey(Random* rnd, int len, RandomKeyType type) { // Make sure to generate a wide variety of characters so we // test the boundary conditions for short-key optimizations. static const char kTestChars[] = { @@ -41,7 +41,22 @@ std::string RandomKey(Random* rnd, int len) { }; std::string result; for (int i = 0; i < len; i++) { - result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; + std::size_t indx = 0; + switch (type) { + case RandomKeyType::RANDOM: + indx = rnd->Uniform(sizeof(kTestChars)); + break; + case RandomKeyType::LARGEST: + indx = sizeof(kTestChars) - 1; + break; + case RandomKeyType::MIDDLE: + indx = sizeof(kTestChars) / 2; + break; + case RandomKeyType::SMALLEST: + indx = 0; + break; + } + result += kTestChars[indx]; } return result; } diff --git a/util/testutil.h b/util/testutil.h index 5304ab163..0bde21c31 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -35,7 +35,9 @@ extern std::string RandomHumanReadableString(Random* rnd, int len); // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). -extern std::string RandomKey(Random* rnd, int len); +enum RandomKeyType : char { RANDOM, LARGEST, SMALLEST, MIDDLE }; +extern std::string RandomKey(Random* rnd, int len, + RandomKeyType type = RandomKeyType::RANDOM); // Store in *dst a string of length "len" that will compress to // "N*compressed_fraction" bytes and return a Slice that references