[Performance Branch] Fix a bug of PlainTable when building indexes

Summary: PlainTable now has a bug of the ordering of indexes for the prefixes in the same bucket. I thought std::map guaranteed key order but it didn't, probably because I didn't use it properly. But seems to me that we don't need to make extra sorting as input prefixes are already sorted. Found by problem by running leaf4 against plain table. Replace the map with a vector. It should performs better too. After the fix, leaf4 unit tests are passing. Test Plan: run plain_table_db_test Also going to run db_test with plain table in the uncommitted branch. Reviewers: haobo, kailiu Reviewed By: haobo CC: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D14649
12 years ago · 9718c790ec
parent 0cd1521af5
commit 9718c790ec
1 changed files with 15 additions and 15 deletions
--- a/table/plain_table_reader.cc
+++ b/table/plain_table_reader.cc
@ -37,14 +37,6 @@ public:
    return MurmurHash(s.data(), s.size(), 397);
  }
 };
-
-class slice_comparator {
-public:
-  bool operator()(rocksdb::Slice const& s1,
-                  rocksdb::Slice const& s2) const {
-    return s1.compare(s2) < 0;
-  }
-};
 }

 namespace rocksdb {
@ -146,8 +138,8 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) {
  HistogramImpl keys_per_prefix_hist;
  // Need map to be ordered to make sure sub indexes generated
  // are in order.
-  std::map<Slice, std::string, std::slice_comparator> prefix2map;
-
+  std::vector<std::pair<Slice, std::string>> prefix_index_pairs;
+  std::string current_prefix_index;
  while (pos < file_size) {
    uint32_t key_offset = pos;
    status_ = Next(pos, &key_slice, &value_slice, pos);
@ -156,6 +148,11 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) {
    if (first || prev_key_prefix_slice != key_prefix_slice) {
      if (!first) {
        keys_per_prefix_hist.Add(key_index_within_prefix);
+        prefix_index_pairs.push_back(
+            std::make_pair<Slice, std::string>(
+                std::move(prev_key_prefix_slice),
+                std::move(current_prefix_index)));
+        current_prefix_index.clear();
      }
      key_index_within_prefix = 0;
      prev_key_prefix_slice = key_prefix_slice;
@ -163,27 +160,30 @@ Status PlainTableReader::PopulateIndex(uint64_t file_size) {

    if (key_index_within_prefix++ % 8 == 0) {
      // Add an index key for every 8 keys
-      std::string& prefix_index = prefix2map[key_prefix_slice];
-      PutFixed32(&prefix_index, key_offset);
+      PutFixed32(&current_prefix_index, key_offset);
    }
    first = false;
  }
+  prefix_index_pairs.push_back(
+      std::make_pair<Slice, std::string>(std::move(prev_key_prefix_slice),
+                                         std::move(current_prefix_index)));
+
  keys_per_prefix_hist.Add(key_index_within_prefix);
  if (hash_table_ != nullptr) {
    delete[] hash_table_;
  }
  std::vector<Slice> filter_entries(0); // for creating bloom filter;
  if (filter_policy_ != nullptr) {
-    filter_entries.reserve(prefix2map.size());
+    filter_entries.reserve(prefix_index_pairs.size());
  }
  double hash_table_size_multipier =
      (hash_table_ratio_ > 1.0) ? 1.0 : 1.0 / hash_table_ratio_;
-  hash_table_size_ = prefix2map.size() * hash_table_size_multipier + 1;
+  hash_table_size_ = prefix_index_pairs.size() * hash_table_size_multipier + 1;
  hash_table_ = new uint32_t[hash_table_size_];
  std::vector<std::string> hash2map(hash_table_size_);

  size_t sub_index_size_needed = 0;
-  for (auto& p: prefix2map) {
+  for (auto& p: prefix_index_pairs) {
    auto& sub_index = hash2map[getBucketId(p.first, key_prefix_len_,
                                           hash_table_size_)];
    if (sub_index.length() > 0 || p.second.length() > kOffsetLen) {