diff --git a/Makefile b/Makefile index cc5936508..515db7da6 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,7 @@ TESTS = \ backupable_db_test \ version_edit_test \ version_set_test \ + file_indexer_test \ write_batch_test\ deletefile_test \ table_test \ @@ -376,6 +377,9 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/file_indexer.cc b/db/file_indexer.cc new file mode 100644 index 000000000..2de766024 --- /dev/null +++ b/db/file_indexer.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/file_indexer.h" +#include +#include "rocksdb/comparator.h" +#include "db/version_edit.h" + +namespace rocksdb { + +FileIndexer::FileIndexer(const uint32_t num_levels, + const Comparator* ucmp) + : num_levels_(num_levels), + ucmp_(ucmp), + next_level_index_(num_levels), + level_rb_(num_levels, -1) { +} + + +uint32_t FileIndexer::NumLevelIndex() { + return next_level_index_.size(); +} + +uint32_t FileIndexer::LevelIndexSize(uint32_t level) { + return next_level_index_[level].size(); +} + +void FileIndexer::GetNextLevelIndex( + const uint32_t level, const uint32_t file_index, const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, int32_t* right_bound) { + assert(level > 0); + + // Last level, no hint + if (level == num_levels_ - 1) { + *left_bound = 0; + *right_bound = -1; + return; + } + + assert(level < num_levels_ - 1); + assert(static_cast(file_index) <= level_rb_[level]); + + const auto& index = next_level_index_[level][file_index]; + + if (cmp_smallest < 0) { + *left_bound = (level > 0 && file_index > 0) ? 
+ next_level_index_[level][file_index - 1].largest_lb : 0; + *right_bound = index.smallest_rb; + } else if (cmp_smallest == 0) { + *left_bound = index.smallest_lb; + *right_bound = index.smallest_rb; + } else if (cmp_smallest > 0 && cmp_largest < 0) { + *left_bound = index.smallest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest == 0) { + *left_bound = index.largest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest > 0) { + *left_bound = index.largest_lb; + *right_bound = level_rb_[level + 1]; + } else { + assert(false); + } + + assert(*left_bound >= 0); + assert(*left_bound <= *right_bound + 1); + assert(*right_bound <= level_rb_[level + 1]); +} + +void FileIndexer::ClearIndex() { + for (uint32_t level = 1; level < num_levels_; ++level) { + next_level_index_[level].clear(); + } +} + +void FileIndexer::UpdateIndex(std::vector* const files) { + if (files == nullptr) { + return; + } + + // L1 - Ln-1 + for (uint32_t level = 1; level < num_levels_ - 1; ++level) { + const auto& upper_files = files[level]; + const int32_t upper_size = upper_files.size(); + const auto& lower_files = files[level + 1]; + level_rb_[level] = upper_files.size() - 1; + if (upper_size == 0) { + continue; + } + auto& index = next_level_index_[level]; + index.resize(upper_size); + + CalculateLB(upper_files, lower_files, &index, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->Compare(a->smallest.user_key(), b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { + index->smallest_lb = f_idx; + }); + CalculateLB(upper_files, lower_files, &index, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->Compare(a->largest.user_key(), b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { + index->largest_lb = f_idx; + }); + CalculateRB(upper_files, lower_files, &index, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->Compare(a->smallest.user_key(), 
b->smallest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { + index->smallest_rb = f_idx; + }); + CalculateRB(upper_files, lower_files, &index, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->Compare(a->largest.user_key(), b->smallest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { + index->largest_rb = f_idx; + }); + } + level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1; +} + +void FileIndexer::CalculateLB(const std::vector& upper_files, + const std::vector& lower_files, + std::vector* index, + std::function cmp_op, + std::function set_index) { + const int32_t upper_size = upper_files.size(); + const int32_t lower_size = lower_files.size(); + int32_t upper_idx = 0; + int32_t lower_idx = 0; + while (upper_idx < upper_size && lower_idx < lower_size) { + int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]); + + if (cmp == 0) { + set_index(&(*index)[upper_idx], lower_idx); + ++upper_idx; + ++lower_idx; + } else if (cmp > 0) { + // Lower level's file (largest) is smaller, a key won't hit in that + // file. Move to next lower file + ++lower_idx; + } else { + // Lower level's file becomes larger, update the index, and + // move to the next upper file + set_index(&(*index)[upper_idx], lower_idx); + ++upper_idx; + } + } + + while (upper_idx < upper_size) { + // Lower files are exhausted, that means the remaining upper files are + // greater than any lower files. Set the index to be the lower level size. 
+ set_index(&(*index)[upper_idx], lower_size); + ++upper_idx; + } +} + +void FileIndexer::CalculateRB(const std::vector& upper_files, + const std::vector& lower_files, + std::vector* index, + std::function cmp_op, + std::function set_index) { + const int32_t upper_size = upper_files.size(); + const int32_t lower_size = lower_files.size(); + int32_t upper_idx = upper_size - 1; + int32_t lower_idx = lower_size - 1; + while (upper_idx >= 0 && lower_idx >= 0) { + int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]); + + if (cmp == 0) { + set_index(&(*index)[upper_idx], lower_idx); + --upper_idx; + --lower_idx; + } else if (cmp < 0) { + // Lower level's file (smallest) is larger, a key won't hit in that + // file. Move to the next lower file. + --lower_idx; + } else { + // Lower level's file becomes smaller, update the index, and move to + // the next upper file. + set_index(&(*index)[upper_idx], lower_idx); + --upper_idx; + } + } + while (upper_idx >= 0) { + // Lower files are exhausted, that means the remaining upper files are + // smaller than any lower files. Set it to -1. + set_index(&(*index)[upper_idx], -1); + --upper_idx; + } +} + +} // namespace rocksdb diff --git a/db/file_indexer.h b/db/file_indexer.h new file mode 100644 index 000000000..a42fb0409 --- /dev/null +++ b/db/file_indexer.h @@ -0,0 +1,129 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include +#include +#include + +namespace rocksdb { + +class Comparator; +class FileMetaData; + +// The file tree structure in Version is prebuilt and the range of each file +// is known. On Version::Get(), it uses binary search to find a potential file +// and then checks if a target key can be found in the file by comparing the key +// to each file's smallest and largest key. The results of these comparisons +// can be reused beyond checking if a key falls into a file's range. +// With some pre-calculated knowledge, each key comparison that has been done +// can serve as a hint to narrow down further searches: if a key compared to +// be smaller than a file's smallest or largest, that comparison can be used +// to find out the right bound of next binary search. Similarly, if a key +// compared to be larger than a file's smallest or largest, it can be utilized +// to find out the left bound of next binary search. +// With these hints, it can greatly reduce the range of binary search, +// especially for bottom levels, given that one file most likely overlaps with +// only N files from level below (where N is max_bytes_for_level_multiplier). +// So on level L, we will only look at ~N files instead of N^L files on the +// naive approach. +class FileIndexer { + public: + FileIndexer(const uint32_t num_levels, const Comparator* ucmp); + + uint32_t NumLevelIndex(); + + uint32_t LevelIndexSize(uint32_t level); + + // Return a file index range in the next level to search for a key based on + // smallest and largest key comparison for the current file specified by + // level and file_index. When *left_bound <= *right_bound, both bounds are + // valid indexes into the next level's file vector. 
+ void GetNextLevelIndex( + const uint32_t level, const uint32_t file_index, const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, int32_t* right_bound); + + void ClearIndex(); + + void UpdateIndex(std::vector* const files); + + enum { + kLevelMaxIndex = std::numeric_limits::max() + }; + + private: + const uint32_t num_levels_; + const Comparator* ucmp_; + + struct IndexUnit { + IndexUnit() + : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {} + // During file search, a key is compared against smallest and largest + // from a FileMetaData. It can have 3 possible outcomes: + // (1) key is smaller than smallest, implying it is also smaller than + // largest. Precalculated index based on "smallest < smallest" can + // be used to provide right bound. + // (2) key is in between smallest and largest. + // Precalculated index based on "smallest > largest" can be used to + // provide left bound. + // Precalculated index based on "largest < smallest" can be used to + // provide right bound. + // (3) key is larger than largest, implying it is also larger than smallest. + // Precalculated index based on "largest > largest" can be used to + // provide left bound. + // + // As a result, we will need to do: + // Compare smallest (<=) and largest keys from upper level file with + // smallest key from lower level to get a right bound. + // Compare smallest (>=) and largest keys from upper level file with + // largest key from lower level to get a left bound. + // + // Example: + // level 1: [50 - 60] + // level 2: [1 - 40], [45 - 55], [58 - 80] + // A key 35, compared to be less than 50, 3rd file on level 2 can be + // skipped according to rule (1). LB = 0, RB = 1. + // A key 53, sits between 50 and 60. 1st file on level 2 can be + // skipped according to rule (2)-a, but the 3rd file cannot be skipped + // because 60 is greater than 58. LB = 1, RB = 2. + // A key 70, compared to be larger than 60. 
1st and 2nd file can be skipped + // according to rule (3). LB = 2, RB = 2. + // + // Point to a left most file in a lower level that may contain a key, + // which compares greater than smallest of a FileMetaData (upper level) + int32_t smallest_lb; + // Point to a left most file in a lower level that may contain a key, + // which compares greater than largest of a FileMetaData (upper level) + int32_t largest_lb; + // Point to a right most file in a lower level that may contain a key, + // which compares smaller than smallest of a FileMetaData (upper level) + int32_t smallest_rb; + // Point to a right most file in a lower level that may contain a key, + // which compares smaller than largest of a FileMetaData (upper level) + int32_t largest_rb; + }; + + void CalculateLB(const std::vector& upper_files, + const std::vector& lower_files, + std::vector* index, + std::function cmp_op, + std::function set_index); + + void CalculateRB(const std::vector& upper_files, + const std::vector& lower_files, + std::vector* index, + std::function cmp_op, + std::function set_index); + + std::vector> next_level_index_; + std::vector level_rb_; +}; + +} // namespace rocksdb diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc new file mode 100644 index 000000000..26a0508a3 --- /dev/null +++ b/db/file_indexer_test.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "db/file_indexer.h" +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "rocksdb/comparator.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// Test-only comparator that orders 8-byte slices by their int64_t value. +class IntComparator : public Comparator { + public: + // Three-way compare of two 8-byte keys read as int64_t. Returns -1, 0 or 1 + // rather than the raw difference: narrowing "a - b" from int64_t to int can + // truncate to 0 or flip the sign, and the subtraction itself can overflow. + int Compare(const Slice& a, const Slice& b) const { + assert(a.size() == 8); + assert(b.size() == 8); + const int64_t lhs = *reinterpret_cast<const int64_t*>(a.data()); + const int64_t rhs = *reinterpret_cast<const int64_t*>(b.data()); + return (lhs > rhs) - (lhs < rhs); + } + + const char* Name() const { + return "IntComparator"; + } + + void FindShortestSeparator(std::string* start, const Slice& limit) const {} + + void FindShortSuccessor(std::string* key) const {} +}; + + +struct FileIndexerTest { + public: + FileIndexerTest() : + kNumLevels(4), indexer(kNumLevels, &ucmp), + files(new std::vector[kNumLevels]) { + } + + ~FileIndexerTest() { + Reset(); + delete[] files; + } + + void AddFile(int level, int64_t smallest, int64_t largest) { + auto* f = new FileMetaData(); + f->smallest = IntKey(smallest); + f->largest = IntKey(largest); + files[level].push_back(f); + } + + InternalKey IntKey(int64_t v) { + return InternalKey(Slice(reinterpret_cast(&v), 8), 0, kTypeValue); + } + + void Reset() { + for (int i = 0; i < kNumLevels; ++i) { + for (auto* f : files[i]) { + delete f; + } + files[i].clear(); + } + indexer.ClearIndex(); + } + + void GetNextLevelIndex(const uint32_t level, const uint32_t file_index, + const int cmp_smallest, const int cmp_largest, int32_t* left_index, + int32_t* right_index) { + *left_index = 100; + *right_index = 100; + indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest, + left_index, right_index); + } + + const uint32_t kNumLevels; + IntComparator ucmp; + FileIndexer indexer; + + std::vector* files; +}; + +TEST(FileIndexerTest, next_level_hint) { + for (uint32_t i = 0; i < kNumLevels; ++i) { + ASSERT_EQ(0, indexer.LevelIndexSize(i)); + } + + // Case 1: no overlap, files are on the left of next level files + // level 1 + AddFile(1, 100, 200); + AddFile(1, 
300, 400); + AddFile(1, 500, 600); + // level 2 + AddFile(2, 1500, 1600); + AddFile(2, 1601, 1699); + AddFile(2, 1700, 1800); + // level 3 + AddFile(3, 2500, 2600); + AddFile(3, 2601, 2699); + AddFile(3, 2700, 2800); + indexer.UpdateIndex(files); + int32_t left = 100; + int32_t right = 100; + for (uint32_t level = 1; level < 3; ++level) { + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(level, f, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, 0, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, 1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(2, right); + } + } + + // Case 2: no overlap, files are on the right of next level files + Reset(); + for (uint32_t i = 1; i < kNumLevels; ++i) { + ASSERT_EQ(0, indexer.LevelIndexSize(i)); + } + // level 1 + AddFile(1, 2100, 2200); + AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 2 + AddFile(2, 1500, 1600); + AddFile(2, 1501, 1699); + AddFile(2, 1700, 1800); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 501, 699); + AddFile(3, 700, 800); + indexer.UpdateIndex(files); + for (uint32_t level = 1; level < 3; ++level) { + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(level, f, -1, -1, &left, &right); + ASSERT_EQ(f == 0 ? 
0 : 3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 0, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 0, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + } + } + + // Case 3: empty L2 + Reset(); + for (uint32_t i = 1; i < kNumLevels; ++i) { + ASSERT_EQ(0, indexer.LevelIndexSize(i)); + } + // level 1 + AddFile(1, 2100, 2200); + AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 501, 699); + AddFile(3, 700, 800); + indexer.UpdateIndex(files); + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(1, f, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 0, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + } + + + // Case 4: mixed + Reset(); + for (uint32_t i = 1; i < kNumLevels; ++i) { + ASSERT_EQ(0, indexer.LevelIndexSize(i)); + } + // level 1 + AddFile(1, 100, 200); + AddFile(1, 250, 400); + AddFile(1, 450, 500); + // level 2 + AddFile(2, 100, 150); // 0 + AddFile(2, 200, 250); // 1 + AddFile(2, 251, 300); // 2 + AddFile(2, 301, 350); // 3 + AddFile(2, 500, 600); // 4 + // level 3 + AddFile(3, 0, 50); + AddFile(3, 100, 200); + AddFile(3, 201, 250); + indexer.UpdateIndex(files); + // level 1, 0 + GetNextLevelIndex(1, 
0, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 0, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(4, right); + // level 1, 1 + GetNextLevelIndex(1, 1, -1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 0, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + // level 1, 2 + GetNextLevelIndex(1, 2, -1, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 2, 0, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 2, 1, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + GetNextLevelIndex(1, 2, 1, 0, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + GetNextLevelIndex(1, 2, 1, 1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + // level 2, 0 + GetNextLevelIndex(2, 0, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, 0, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, 1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(2, right); + // level 2, 1 + GetNextLevelIndex(2, 1, -1, -1, &left, &right); + ASSERT_EQ(1, left); + 
ASSERT_EQ(1, right); + GetNextLevelIndex(2, 1, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 1, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, 1, 1, 0, &left, &right); + ASSERT_EQ(2, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, 1, 1, 1, &left, &right); + ASSERT_EQ(2, left); + ASSERT_EQ(2, right); + // level 2, [2 - 4], no overlap + for (uint32_t f = 2; f <= 4; ++f) { + GetNextLevelIndex(2, f, -1, -1, &left, &right); + ASSERT_EQ(f == 2 ? 2 : 3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 0, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, 0, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, 1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/version_set.cc b/db/version_set.cc index 704f2a929..40a096253 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -71,11 +71,11 @@ Version::~Version() { delete[] files_; } -int FindFile(const InternalKeyComparator& icmp, - const std::vector& files, - const Slice& key) { - uint32_t left = 0; - uint32_t right = files.size(); +int FindFileInRange(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key, + uint32_t left, + uint32_t right) { while (left < right) { uint32_t mid = (left + right) / 2; const FileMetaData* f = files[mid]; @@ -92,6 +92,12 @@ int FindFile(const InternalKeyComparator& icmp, return right; } +int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key) { + return FindFileInRange(icmp, files, key, 0, files.size()); +} + static bool AfterFile(const Comparator* ucmp, const Slice* user_key, const FileMetaData* f) { // nullptr 
user_key occurs before all keys and is therefore never after *f @@ -507,7 +513,10 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, file_to_compact_level_(-1), compaction_score_(num_levels_), compaction_level_(num_levels_), - version_number_(version_number) {} + version_number_(version_number), + file_indexer_(num_levels_, cfd == nullptr ? nullptr + : cfd->internal_comparator().user_comparator()) { +} void Version::Get(const ReadOptions& options, const LookupKey& k, @@ -538,12 +547,27 @@ void Version::Get(const ReadOptions& options, int last_file_read_level = -1; // We can search level-by-level since entries never hop across - // levels. Therefore we are guaranteed that if we find data + // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). - for (int level = 0; level < num_levels_; level++) { - size_t num_files = files_[level].size(); - if (num_files == 0) continue; + + int32_t search_left_bound = 0; + int32_t search_right_bound = FileIndexer::kLevelMaxIndex; + for (int level = 0; level < num_levels_; ++level) { + int num_files = files_[level].size(); + if (num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. + assert(search_left_bound == 0); + assert(search_right_bound == -1 || + search_right_bound == FileIndexer::kLevelMaxIndex); + // Since current level is empty, it will need to search all files in the + // next level + search_left_bound = 0; + search_right_bound = FileIndexer::kLevelMaxIndex; + continue; + } // Get the list of files to search in this level FileMetaData* const* files = &files_[level][0]; @@ -553,38 +577,65 @@ void Version::Get(const ReadOptions& options, // newest to oldest. In the context of merge-operator, // this can occur at any level. 
Otherwise, it only occurs // at Level-0 (since Put/Deletes are always compacted into a single entry). - uint32_t start_index; + int32_t start_index; if (level == 0) { // On Level-0, we read through all files to check for overlap. start_index = 0; } else { - // On Level-n (n>=1), files are sorted. - // Binary search to find earliest index whose largest key >= ikey. - // We will also stop when the file no longer overlaps ikey - start_index = FindFile(*internal_comparator_, files_[level], ikey); + // On Level-n (n>=1), files are sorted. Binary search to find the earliest + // file whose largest key >= ikey. Search left bound and right bound are + // used to narrow the range. + if (search_left_bound == search_right_bound) { + start_index = search_left_bound; + } else if (search_left_bound < search_right_bound) { + if (search_right_bound == FileIndexer::kLevelMaxIndex) { + search_right_bound = num_files - 1; + } + start_index = FindFileInRange(cfd_->internal_comparator(), + files_[level], ikey, search_left_bound, search_right_bound); + } else { + // search_left_bound > search_right_bound, key does not exist in this + // level. Since no comparison is done in this level, it will need to + // search all files in the next level. + search_left_bound = 0; + search_right_bound = FileIndexer::kLevelMaxIndex; + continue; + } } - // Traverse each relevant file to find the desired key #ifndef NDEBUG FileMetaData* prev_file = nullptr; #endif - for (uint32_t i = start_index; i < num_files; ++i) { + + for (int32_t i = start_index; i < num_files;) { FileMetaData* f = files[i]; - // Skip key range filtering for levle 0 if there are few level 0 files. - if ((level > 0 || num_files > 2) && - (user_comparator_->Compare(user_key, f->smallest.user_key()) < 0 || - user_comparator_->Compare(user_key, f->largest.user_key()) > 0)) { - // Only process overlapping files. - if (level > 0) { - // If on Level-n (n>=1) then the files are sorted. - // So we can stop looking when we are past the ikey. 
+ // Check if key is within a file's range. If search left bound and right + // bound point to the same file, we are sure key falls in range. + assert(level == 0 || i == start_index || + user_comparator_->Compare(user_key, f->smallest.user_key()) <= 0); + + int cmp_smallest = user_comparator_->Compare(user_key, f->smallest.user_key()); + int cmp_largest = -1; + if (cmp_smallest >= 0) { + cmp_largest = user_comparator_->Compare(user_key, f->largest.user_key()); + } + + // Setup file search bound for the next level based on the comparison + // results + if (level > 0) { + file_indexer_.GetNextLevelIndex(level, i, cmp_smallest, cmp_largest, + &search_left_bound, &search_right_bound); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + if (level == 0) { + ++i; + continue; + } else { + break; + } } - // TODO: do we want to check file ranges for level0 files at all? - // For new SST format where Get() is fast, we might want to consider - // to avoid those two comparisons, if it can filter out too few files. 
- continue; } + #ifndef NDEBUG // Sanity check to make sure that the files are correctly sorted if (prev_file) { @@ -643,6 +694,11 @@ void Version::Get(const ReadOptions& options, case kMerge: break; } + if (level > 0 && cmp_largest < 0) { + break; + } else { + ++i; + } } } @@ -1454,6 +1510,8 @@ class VersionSet::Builder { } CheckConsistency(v); + + v->file_indexer_.UpdateIndex(v->files_); } void LoadTableHandlers() { diff --git a/db/version_set.h b/db/version_set.h index ef616f34b..8076e6bc6 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -33,6 +33,7 @@ #include "db/compaction_picker.h" #include "db/column_family.h" #include "db/log_reader.h" +#include "db/file_indexer.h" namespace rocksdb { @@ -281,6 +282,7 @@ class Version { uint64_t version_number_; Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); + FileIndexer file_indexer_; ~Version(); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 0b4938f25..c7837c38b 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -59,6 +59,7 @@ static bool ValidateUint32Range(const char* flagname, uint64_t value) { } return true; } + DEFINE_uint64(seed, 2341234, "Seed for PRNG"); static const bool FLAGS_seed_dummy __attribute__((unused)) = google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); @@ -377,6 +378,17 @@ static std::string Key(long val) { return big_endian_key; } +static std::string StringToHex(const std::string& str) { + std::string result = "0x"; + char buf[10]; + for (size_t i = 0; i < str.length(); i++) { + snprintf(buf, 10, "%02X", (unsigned char)str[i]); + result += buf; + } + return result; +} + + class StressTest; namespace { @@ -953,8 +965,8 @@ class StressTest { for (int i = 1; i < 10; i++) { if (values[i] != values[0]) { fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", - key.ToString().c_str(), values[0].c_str(), - values[i].c_str()); + key.ToString(true).c_str(), StringToHex(values[0]).c_str(), + 
StringToHex(values[i]).c_str()); // we continue after error rather than exiting so that we can // find more errors if any } @@ -1013,9 +1025,9 @@ class StressTest { // make sure all values are equivalent for (int i = 0; i < 10; i++) { if (values[i] != values[0]) { - fprintf(stderr, "error : inconsistent values for prefix %s: %s, %s\n", - prefixes[i].c_str(), values[0].c_str(), - values[i].c_str()); + fprintf(stderr, "error : %d, inconsistent values for prefix %s: %s, %s\n", + i, prefixes[i].c_str(), StringToHex(values[0]).c_str(), + StringToHex(values[i]).c_str()); // we continue after error rather than exiting so that we can // find more errors if any }