Remove seek compaction

Summary: As discussed in our internal group, we don't get much use of seek compaction at the moment, while it's making code more complicated and slower in some cases. This diff removes seek compaction and (hopefully) all code that was introduced to support seek compaction. There is one test case that relied on didIO information. I'll try to find another way to implement it. Test Plan: make check Reviewers: sdong, haobo, yhchiang, ljin, dhruba Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19161
12 years ago · d4a8423334
parent 107e08baa7
commit d4a8423334
26 changed files with 101 additions and 343 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -1,5 +1,14 @@
 # Rocksdb Change Log
 ## Unreleased (will be released with 3.2.0)
 ### Public API changes
 * We removed seek compaction as a concept from RocksDB because:
 1) It makes more sense for spinning disk workloads, while RocksDB is primarily designed for flash and memory,
 2) It added some complexity to the important code-paths,
 3) None of our internal customers were really using it.
 Because of that, Options::disable_seek_compaction is now obsolete. It is still a parameter in Options, so it does not break the build, but it does not have any effect. We plan to completely remove it at some point, so we ask users to please remove this option from your code base.
 ## 3.1.0 (05/21/2014)
 ### Public API changes
--- a/build_tools/regression_build_test.sh
+++ b/build_tools/regression_build_test.sh
@ -109,7 +109,6 @@ make release
    --cache_numshardbits=6 \
    --table_cache_numshardbits=4 \
    --open_files=55000 \
    --disable_seek_compaction=1 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
@ -129,7 +128,6 @@ make release
    --cache_numshardbits=6 \
    --table_cache_numshardbits=4 \
    --open_files=55000 \
    --disable_seek_compaction=1 \
    --use_tailing_iterator=1 \
    --statistics=1 \
    --histogram=1 \
@ -150,7 +148,6 @@ make release
    --cache_numshardbits=6 \
    --table_cache_numshardbits=4 \
    --open_files=55000 \
    --disable_seek_compaction=1 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
@ -172,7 +169,6 @@ make release
    --table_cache_numshardbits=4 \
    --write_buffer_size=1000000000 \
    --open_files=55000 \
    --disable_seek_compaction=1 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
@ -231,7 +227,6 @@ make release
    --cache_numshardbits=6 \
    --table_cache_numshardbits=4 \
    --open_files=55000 \
    --disable_seek_compaction=1 \
    --disable_auto_compactions=1 \
    --statistics=1 \
    --histogram=1 \
@ -254,7 +249,6 @@ make release
    --cache_numshardbits=6 \
    --table_cache_numshardbits=4 \
    --open_files=55000 \
    --disable_seek_compaction=1 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
@ -274,7 +268,6 @@ make release
    --table_cache_numshardbits=4 \
    --write_buffer_size=1000000000 \
    --open_files=55000 \
    --disable_seek_compaction=1 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
@ -291,7 +284,6 @@ common_in_mem_args="--db=/dev/shm/rocksdb \
    --value_size=100 \
    --compression_type=none \
    --compression_ratio=1 \
    --disable_seek_compaction=1 \
    --hard_rate_limit=2 \
    --write_buffer_size=134217728 \
    --max_write_buffer_number=4 \
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@ -749,7 +749,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
  default_cf.filter_policy = nullptr;
  default_cf.no_block_cache = true;
  default_cf.source_compaction_factor = 100;
  default_cf.disable_seek_compaction = false;
  one.compaction_style = kCompactionStyleUniversal;
  // trigger compaction if there are >= 4 files
@ -764,33 +763,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
  Reopen({default_cf, one, two});
  // SETUP column family "default" - test read compaction
  ASSERT_EQ("", FilesPerLevel(0));
  PutRandomData(0, 1, 4096);
  ASSERT_OK(Flush(0));
  ASSERT_EQ("0,0,1", FilesPerLevel(0));
  // write 8MB
  PutRandomData(0, 2000, 4096);
  ASSERT_OK(Flush(0));
  // clear levels 0 and 1
  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
  // write some new keys into level 0 and 1
  PutRandomData(0, 1024, 512);
  ASSERT_OK(Flush(0));
  WaitForCompaction();
  PutRandomData(0, 10, 512);
  ASSERT_OK(Flush(0));
  // remember number of files in each level
  int l1 = NumTableFilesAtLevel(0, 0);
  int l2 = NumTableFilesAtLevel(1, 0);
  int l3 = NumTableFilesAtLevel(2, 0);
  ASSERT_NE(l1, 0);
  ASSERT_NE(l2, 0);
  ASSERT_NE(l3, 0);
  // SETUP column family "one" -- universal style
  for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
    PutRandomData(1, 11, 10000);
@ -805,12 +777,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
  }
  // TRIGGER compaction "default"
  // read a bunch of times, trigger read compaction
  for (int i = 0; i < 200000; ++i) {
    Get(0, std::to_string(i));
  }
  // TRIGGER compaction "one"
  PutRandomData(1, 12, 10000);
@ -820,13 +786,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
  // WAIT for compactions
  WaitForCompaction();
  // VERIFY compaction "default"
  // verify that the number of files have decreased
  // in some level, indicating that there was a compaction
  ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
              NumTableFilesAtLevel(1, 0) < l2 ||
              NumTableFilesAtLevel(2, 0) < l3);
  // VERIFY compaction "one"
  ASSERT_EQ("1", FilesPerLevel(1));
--- a/db/compaction_picker.cc
+++ b/db/compaction_picker.cc
@ -407,31 +407,6 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version,
    }
  }
  // Find compactions needed by seeks
  FileMetaData* f = version->file_to_compact_;
  if (c == nullptr && f != nullptr && !f->being_compacted) {
    level = version->file_to_compact_level_;
    int parent_index = -1;
    // Only allow one level 0 compaction at a time.
    // Do not pick this file if its parents at level+1 are being compacted.
    if (level != 0 || compactions_in_progress_[0].empty()) {
      if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
                                   &parent_index)) {
        c = new Compaction(version, level, level + 1,
                           MaxFileSizeForLevel(level + 1),
                           MaxGrandParentOverlapBytes(level), true);
        c->inputs_[0].push_back(f);
        c->parent_index_ = parent_index;
        c->input_version_->file_to_compact_ = nullptr;
        if (ExpandWhileOverlapping(c) == false) {
          return nullptr;
        }
      }
    }
  }
  if (c == nullptr) {
    return nullptr;
  }
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -333,9 +333,6 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
             "deletepercent), so deletepercent must be smaller than (100 - "
             "FLAGS_readwritepercent)");
 DEFINE_int32(disable_seek_compaction, false, "Option to disable compaction"
             " triggered by read.");
 DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete "
              "obsolete files periodically. 0 means that obsolete files are"
              " deleted after every compaction run.");
@ -553,6 +550,9 @@ static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
 static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
 DEFINE_int32(disable_seek_compaction, false,
             "Not used, left here for backwards compatibility");
 static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
 static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
@ -1663,7 +1663,6 @@ class Benchmark {
        options.compression_per_level[i] = FLAGS_compression_type_e;
      }
    }
    options.disable_seek_compaction = FLAGS_disable_seek_compaction;
    options.delete_obsolete_files_period_micros =
      FLAGS_delete_obsolete_files_period_micros;
    options.soft_rate_limit = FLAGS_soft_rate_limit;
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@ -3276,9 +3276,6 @@ Status DBImpl::GetImpl(const ReadOptions& options,
    mutex_.Unlock();
  }
  bool have_stat_update = false;
  Version::GetStats stats;
  // Prepare to store a list of merge operations if merge occurs.
  MergeContext merge_context;
@ -3297,23 +3294,13 @@ Status DBImpl::GetImpl(const ReadOptions& options,
  } else {
    PERF_TIMER_START(get_from_output_files_time);
-    sv->current->Get(options, lkey, value, &s, &merge_context, &stats,
+    sv->current->Get(options, lkey, value, &s, &merge_context, value_found);
                     value_found);
    have_stat_update = true;
    PERF_TIMER_STOP(get_from_output_files_time);
    RecordTick(options_.statistics.get(), MEMTABLE_MISS);
  }
  PERF_TIMER_START(get_post_process_time);
  if (!cfd->options()->disable_seek_compaction && have_stat_update) {
    mutex_.Lock();
    if (sv->current->UpdateStats(stats)) {
      MaybeScheduleFlushOrCompaction();
    }
    mutex_.Unlock();
  }
  bool unref_sv = true;
  if (LIKELY(options_.allow_thread_local)) {
    unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
@ -3350,8 +3337,6 @@ std::vector<Status> DBImpl::MultiGet(
  struct MultiGetColumnFamilyData {
    ColumnFamilyData* cfd;
    SuperVersion* super_version;
    Version::GetStats stats;
    bool have_stat_update = false;
  };
  std::unordered_map<uint32_t, MultiGetColumnFamilyData*> multiget_cf_data;
  // fill up and allocate outside of mutex
@ -3412,9 +3397,7 @@ std::vector<Status> DBImpl::MultiGet(
                                       *cfd->options())) {
      // Done
    } else {
-      super_version->current->Get(options, lkey, value, &s, &merge_context,
+      super_version->current->Get(options, lkey, value, &s, &merge_context);
                                  &mgd->stats);
      mgd->have_stat_update = true;
    }
    if (s.ok()) {
@ -3426,24 +3409,15 @@ std::vector<Status> DBImpl::MultiGet(
  PERF_TIMER_START(get_post_process_time);
  autovector<SuperVersion*> superversions_to_delete;
-  bool schedule_flush_or_compaction = false;
+  // TODO(icanadi) do we need lock here or just around Cleanup()?
  mutex_.Lock();
  for (auto mgd_iter : multiget_cf_data) {
    auto mgd = mgd_iter.second;
    auto cfd = mgd->cfd;
    if (!cfd->options()->disable_seek_compaction && mgd->have_stat_update) {
      if (mgd->super_version->current->UpdateStats(mgd->stats)) {
        schedule_flush_or_compaction = true;
      }
    }
    if (mgd->super_version->Unref()) {
      mgd->super_version->Cleanup();
      superversions_to_delete.push_back(mgd->super_version);
    }
  }
  if (schedule_flush_or_compaction) {
    MaybeScheduleFlushOrCompaction();
  }
  mutex_.Unlock();
  for (auto td : superversions_to_delete) {
--- a/db/db_impl_readonly.cc
+++ b/db/db_impl_readonly.cc
@ -65,9 +65,7 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
  if (super_version->mem->Get(lkey, value, &s, merge_context,
                              *cfd->options())) {
  } else {
-    Version::GetStats stats;
+    super_version->current->Get(options, lkey, value, &s, &merge_context);
    super_version->current->Get(options, lkey, value, &s, &merge_context,
                                &stats);
  }
  return s;
 }
--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -5869,69 +5869,6 @@ TEST(DBTest, ReadFirstRecordCache) {
  ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
 }
 TEST(DBTest, ReadCompaction) {
  std::string value(4096, '4'); // a string of size 4K
  {
    Options options = CurrentOptions();
    options.create_if_missing = true;
    options.max_open_files = 20; // only 10 file in file-cache
    options.target_file_size_base = 512;
    options.write_buffer_size = 64 * 1024;
    options.filter_policy = nullptr;
    options.block_size = 4096;
    options.no_block_cache = true;
    options.disable_seek_compaction = false;
    CreateAndReopenWithCF({"pikachu"}, &options);
    // Write 8MB (2000 values, each 4K)
    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
    std::vector<std::string> values;
    for (int i = 0; i < 2000; i++) {
      ASSERT_OK(Put(1, Key(i), value));
    }
    // clear level 0 and 1 if necessary.
    Flush(1);
    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
    // write some new keys into level 0
    for (int i = 0; i < 2000; i = i + 16) {
      ASSERT_OK(Put(1, Key(i), value));
    }
    Flush(1);
    // Wait for any write compaction to finish
    dbfull()->TEST_WaitForCompact();
    // remember number of files in each level
    int l1 = NumTableFilesAtLevel(0, 1);
    int l2 = NumTableFilesAtLevel(1, 1);
    int l3 = NumTableFilesAtLevel(2, 1);
    ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
    ASSERT_NE(NumTableFilesAtLevel(1, 1), 0);
    ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
    // read a bunch of times, trigger read compaction
    for (int j = 0; j < 100; j++) {
      for (int i = 0; i < 2000; i++) {
        Get(1, Key(i));
      }
    }
    // wait for read compaction to finish
    env_->SleepForMicroseconds(1000000);
    // verify that the number of files have decreased
    // in some level, indicating that there was a compaction
    ASSERT_TRUE(NumTableFilesAtLevel(0, 1) < l1 ||
                NumTableFilesAtLevel(1, 1) < l2 ||
                NumTableFilesAtLevel(2, 1) < l3);
  }
 }
 // Multi-threaded test:
 namespace {
@ -6641,7 +6578,6 @@ TEST(DBTest, PrefixScan) {
  options.disable_auto_compactions = true;
  options.max_background_compactions = 2;
  options.create_if_missing = true;
  options.disable_seek_compaction = true;
  options.memtable_factory.reset(NewHashSkipListRepFactory());
  // 11 RAND I/Os
--- a/db/simple_table_db_test.cc
+++ b/db/simple_table_db_test.cc
@ -87,7 +87,7 @@ public:
  Status Get(const ReadOptions&, const Slice& key, void* arg,
             bool (*handle_result)(void* arg, const ParsedInternalKey& k,
-                                   const Slice& v, bool),
+                                   const Slice& v),
             void (*mark_key_may_exist)(void*) = nullptr) override;
  uint64_t ApproximateOffsetOf(const Slice& key) override;
@ -285,7 +285,7 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
 Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
                              void* arg,
                              bool (*saver)(void*, const ParsedInternalKey&,
-                                            const Slice&, bool),
+                                            const Slice&),
                              void (*mark_key_may_exist)(void*)) {
  Status s;
  SimpleTableIterator* iter = new SimpleTableIterator(this);
@ -295,7 +295,7 @@ Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
      return Status::Corruption(Slice());
    }
-    if (!(*saver)(arg, parsed_key, iter->value(), true)) {
+    if (!(*saver)(arg, parsed_key, iter->value())) {
      break;
    }
  }
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -58,7 +58,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
 Status TableCache::FindTable(const EnvOptions& toptions,
                             const InternalKeyComparator& internal_comparator,
                             const FileDescriptor& fd, Cache::Handle** handle,
-                             bool* table_io, const bool no_io) {
+                             const bool no_io) {
  Status s;
  Slice key = GetSliceForFileNumber(&fd.number);
  *handle = cache_->Lookup(key);
@ -66,9 +66,6 @@ Status TableCache::FindTable(const EnvOptions& toptions,
    if (no_io) { // Dont do IO and return a not-found status
      return Status::Incomplete("Table not found in table_cache, no_io is set");
    }
    if (table_io != nullptr) {
      *table_io = true;    // we had to do IO from storage
    }
    std::string fname = TableFileName(dbname_, fd.GetNumber());
    unique_ptr<RandomAccessFile> file;
    unique_ptr<TableReader> table_reader;
@ -110,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
  Cache::Handle* handle = nullptr;
  Status s;
  if (table_reader == nullptr) {
-    s = FindTable(toptions, icomparator, fd, &handle, nullptr,
+    s = FindTable(toptions, icomparator, fd, &handle,
                  options.read_tier == kBlockCacheTier);
    if (!s.ok()) {
      return NewErrorIterator(s, arena);
@ -137,13 +134,13 @@ Status TableCache::Get(const ReadOptions& options,
                       const InternalKeyComparator& internal_comparator,
                       const FileDescriptor& fd, const Slice& k, void* arg,
                       bool (*saver)(void*, const ParsedInternalKey&,
-                                     const Slice&, bool),
+                                     const Slice&),
-                       bool* table_io, void (*mark_key_may_exist)(void*)) {
+                       void (*mark_key_may_exist)(void*)) {
  TableReader* t = fd.table_reader;
  Status s;
  Cache::Handle* handle = nullptr;
  if (!t) {
-    s = FindTable(storage_options_, internal_comparator, fd, &handle, table_io,
+    s = FindTable(storage_options_, internal_comparator, fd, &handle,
                  options.read_tier == kBlockCacheTier);
    if (s.ok()) {
      t = GetTableReaderFromHandle(handle);
@ -174,10 +171,8 @@ Status TableCache::GetTableProperties(
    return s;
  }
  bool table_io;
  Cache::Handle* table_handle = nullptr;
-  s = FindTable(toptions, internal_comparator, fd, &table_handle, &table_io,
+  s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io);
                no_io);
  if (!s.ok()) {
    return s;
  }
--- a/db/table_cache.h
+++ b/db/table_cache.h
@ -52,8 +52,8 @@ class TableCache {
             const InternalKeyComparator& internal_comparator,
             const FileDescriptor& file_fd, const Slice& k, void* arg,
             bool (*handle_result)(void*, const ParsedInternalKey&,
-                                   const Slice&, bool),
+                                   const Slice&),
-             bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);
+             void (*mark_key_may_exist)(void*) = nullptr);
  // Evict any entry for the specified file number
  static void Evict(Cache* cache, uint64_t file_number);
@ -62,7 +62,7 @@ class TableCache {
  Status FindTable(const EnvOptions& toptions,
                   const InternalKeyComparator& internal_comparator,
                   const FileDescriptor& file_fd, Cache::Handle**,
-                   bool* table_io = nullptr, const bool no_io = false);
+                   const bool no_io = false);
  // Get TableReader from a cache handle.
  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
--- a/db/version_edit.h
+++ b/db/version_edit.h
@ -40,7 +40,6 @@ struct FileDescriptor {
 struct FileMetaData {
  int refs;
  FileDescriptor fd;
  int allowed_seeks;          // Seeks allowed until compaction
  InternalKey smallest;       // Smallest internal key served by table
  InternalKey largest;        // Largest internal key served by table
  bool being_compacted;       // Is this file undergoing compaction?
@ -53,7 +52,6 @@ struct FileMetaData {
  FileMetaData()
      : refs(0),
        fd(0, 0),
        allowed_seeks(1 << 30),
        being_compacted(false),
        table_reader_handle(nullptr) {}
 };
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -362,7 +362,6 @@ struct Saver {
  // the merge operations encountered;
  MergeContext* merge_context;
  Logger* logger;
  bool didIO;    // did we do any disk io?
  Statistics* statistics;
 };
 }
@ -381,15 +380,14 @@ static void MarkKeyMayExist(void* arg) {
 }
 static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
-                      const Slice& v, bool didIO) {
+                      const Slice& v) {
  Saver* s = reinterpret_cast<Saver*>(arg);
  MergeContext* merge_contex = s->merge_context;
  std::string merge_result;  // temporary area for merge results later
  assert(s != nullptr && merge_contex != nullptr);
-  // TODO: didIO and Merge?
+  // TODO: Merge?
  s->didIO = didIO;
  if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
    // Key matches. Process it
    switch (parsed_key.type) {
@ -490,8 +488,6 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset,
      files_(new std::vector<FileMetaData*>[num_levels_]),
      files_by_size_(num_levels_),
      next_file_to_compact_by_size_(num_levels_),
      file_to_compact_(nullptr),
      file_to_compact_level_(-1),
      compaction_score_(num_levels_),
      compaction_level_(num_levels_),
      version_number_(version_number),
@ -504,7 +500,6 @@ void Version::Get(const ReadOptions& options,
                  std::string* value,
                  Status* status,
                  MergeContext* merge_context,
                  GetStats* stats,
                  bool* value_found) {
  Slice ikey = k.internal_key();
  Slice user_key = k.user_key();
@ -519,14 +514,8 @@ void Version::Get(const ReadOptions& options,
  saver.merge_operator = merge_operator_;
  saver.merge_context = merge_context;
  saver.logger = info_log_;
  saver.didIO = false;
  saver.statistics = db_statistics_;
  stats->seek_file = nullptr;
  stats->seek_file_level = -1;
  FileMetaData* last_file_read = nullptr;
  int last_file_read_level = -1;
  // We can search level-by-level since entries never hop across
  // levels. Therefore we are guaranteed that if we find data
  // in an smaller level, later levels are irrelevant (unless we
@ -657,32 +646,13 @@ void Version::Get(const ReadOptions& options,
      }
      prev_file = f;
 #endif
      bool tableIO = false;
      *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey,
-                                  &saver, SaveValue, &tableIO, MarkKeyMayExist);
+                                  &saver, SaveValue, MarkKeyMayExist);
      // TODO: examine the behavior for corrupted key
      if (!status->ok()) {
        return;
      }
      if (last_file_read != nullptr && stats->seek_file == nullptr) {
        // We have had more than one seek for this read.  Charge the 1st file.
        stats->seek_file = last_file_read;
        stats->seek_file_level = last_file_read_level;
      }
      // If we did any IO as part of the read, then we remember it because
      // it is a possible candidate for seek-based compaction. saver.didIO
      // is true if the block had to be read in from storage and was not
      // pre-exisiting in the block cache. Also, if this file was not pre-
      // existing in the table cache and had to be freshly opened that needed
      // the index blocks to be read-in, then tableIO is true. One thing
      // to note is that the index blocks are not part of the block cache.
      if (saver.didIO || tableIO) {
        last_file_read = f;
        last_file_read_level = level;
      }
      switch (saver.state) {
        case kNotFound:
          break;      // Keep searching in other files
@ -723,19 +693,6 @@ void Version::Get(const ReadOptions& options,
  }
 }
 bool Version::UpdateStats(const GetStats& stats) {
  FileMetaData* f = stats.seek_file;
  if (f != nullptr) {
    f->allowed_seeks--;
    if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) {
      file_to_compact_ = f;
      file_to_compact_level_ = stats.seek_file_level;
      return true;
    }
  }
  return false;
 }
 void Version::PrepareApply(std::vector<uint64_t>& size_being_compacted) {
  ComputeCompactionScore(size_being_compacted);
  UpdateFilesBySize();
@ -917,9 +874,6 @@ bool Version::Unref() {
 }
 bool Version::NeedsCompaction() const {
  if (file_to_compact_ != nullptr) {
    return true;
  }
  // In universal compaction case, this check doesn't really
  // check the compaction condition, but checks num of files threshold
  // only. We are not going to miss any compaction opportunity
@ -1477,22 +1431,6 @@ class VersionSet::Builder {
      FileMetaData* f = new FileMetaData(new_file.second);
      f->refs = 1;
      // We arrange to automatically compact this file after
      // a certain number of seeks.  Let's assume:
      //   (1) One seek costs 10ms
      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
      //   (3) A compaction of 1MB does 25MB of IO:
      //         1MB read from this level
      //         10-12MB read from next level (boundaries may be misaligned)
      //         10-12MB written to next level
      // This implies that 25 seeks cost the same as the compaction
      // of 1MB of data.  I.e., one seek costs approximately the
      // same as the compaction of 40KB of data.  We are a little
      // conservative and allow approximately one seek for every 16KB
      // of data before triggering a compaction.
      f->allowed_seeks = (f->fd.GetFileSize() / 16384);
      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
      levels_[level].deleted_files.erase(f->fd.GetNumber());
      levels_[level].added_files->insert(f);
    }
@ -1539,10 +1477,9 @@ class VersionSet::Builder {
    for (int level = 0; level < cfd_->NumberLevels(); level++) {
      for (auto& file_meta : *(levels_[level].added_files)) {
        assert (!file_meta->table_reader_handle);
        bool table_io;
        cfd_->table_cache()->FindTable(
            base_->vset_->storage_options_, cfd_->internal_comparator(),
-            file_meta->fd, &file_meta->table_reader_handle, &table_io, false);
+            file_meta->fd, &file_meta->table_reader_handle, false);
        if (file_meta->table_reader_handle != nullptr) {
          // Load table_reader
          file_meta->fd.table_reader =
--- a/db/version_set.h
+++ b/db/version_set.h
@ -85,23 +85,13 @@ class Version {
                    MergeIteratorBuilder* merger_iter_builder);
  // Lookup the value for key.  If found, store it in *val and
-  // return OK.  Else return a non-OK status.  Fills *stats.
+  // return OK.  Else return a non-OK status.
  // Uses *operands to store merge_operator operations to apply later
  // REQUIRES: lock is not held
  struct GetStats {
    FileMetaData* seek_file;
    int seek_file_level;
  };
  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
-           Status* status, MergeContext* merge_context, GetStats* stats,
+           Status* status, MergeContext* merge_context,
           bool* value_found = nullptr);
  // Adds "stats" into the current state.  Returns true if a new
  // compaction may need to be triggered, false otherwise.
  // REQUIRES: lock is held
  bool UpdateStats(const GetStats& stats);
  // Updates internal structures that keep track of compaction scores
  // We use compaction scores to figure out which compaction to do next
  // REQUIRES: If Version is not yet saved to current_, it can be called without
@ -278,10 +268,6 @@ class Version {
  // seconds/minutes (because of concurrent compactions).
  static const int number_of_files_to_sort_ = 50;
  // Next file to compact based on seek stats.
  FileMetaData* file_to_compact_;
  int file_to_compact_level_;
  // Level that should be compacted next and its compaction score.
  // Score < 1 means compaction is not strictly needed.  These fields
  // are initialized by Finalize().
--- a/doc/index.html
+++ b/doc/index.html
@ -642,10 +642,6 @@ Default:1, i.e. pick maxfilesize amount of data as the source of a compaction.
 <li> <code>Options::max_grandparent_overlap_factor</code> -   Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
 stop building a single file in a level->level+1 compaction.
 <p>
 <li> <code>Options::disable_seek_compaction</code> -  Disable compaction triggered by seek.
 With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache
 (which is true if max_open_files is large).
 <p>
 <li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs, submitted to
 the default LOW priority thread pool
 </ul>
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -368,10 +368,15 @@ struct ColumnFamilyOptions {
  // stop building a single file in a level->level+1 compaction.
  int max_grandparent_overlap_factor;
-  // Disable compaction triggered by seek.
+  // We decided to remove seek compaction from RocksDB because:
-  // With bloomfilter and fast storage, a miss on one level
+  // 1) It makes more sense for spinning disk workloads, while RocksDB is
-  // is very cheap if the file handle is cached in table cache
+  // primarily designed for flash and memory,
-  // (which is true if max_open_files is large).
+  // 2) It added some complexity to the important code-paths,
  // 3) None of our internal customers were really using it.
  //
  // Since we removed seek compaction, this option is now obsolete.
  // We left it here for backwards compatiblity (otherwise it would break the
  // build), but we'll remove it at some point.
  // Default: true
  bool disable_seek_compaction;
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@ -56,13 +56,11 @@ const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
 // Read the block identified by "handle" from "file".
 // The only relevant option is options.verify_checksums for now.
 // Set *didIO to true if didIO is not null.
 // On failure return non-OK.
 // On success fill *result and return OK - caller owns *result
 Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer,
                         const ReadOptions& options, const BlockHandle& handle,
-                         Block** result, Env* env, bool* didIO = nullptr,
+                         Block** result, Env* env, bool do_uncompress = true) {
                         bool do_uncompress = true) {
  BlockContents contents;
  Status s = ReadBlockContents(file, footer, options, handle, &contents, env,
                               do_uncompress);
@ -70,9 +68,6 @@ Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer,
    *result = new Block(contents);
  }
  if (didIO != nullptr) {
    *didIO = true;
  }
  return s;
 }
@ -834,7 +829,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
 // Convert an index iterator value (i.e., an encoded BlockHandle)
 // into an iterator over the contents of the corresponding block.
 Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
-    const ReadOptions& ro, bool* didIO, const Slice& index_value) {
+    const ReadOptions& ro, const Slice& index_value) {
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep->options.block_cache.get();
  Cache* block_cache_compressed = rep->options.
@ -872,7 +867,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
    }
    s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
-                      statistics, ro, &block);
+                              statistics, ro, &block);
    if (block.value == nullptr && !no_io && ro.fill_cache) {
      Histograms histogram = READ_BLOCK_GET_MICROS;
@ -880,7 +875,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
      {
        StopWatch sw(rep->options.env, statistics, histogram);
        s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
-                              &raw_block, rep->options.env, didIO,
+                              &raw_block, rep->options.env,
                              block_cache_compressed == nullptr);
      }
@ -898,7 +893,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
      return NewErrorIterator(Status::Incomplete("no blocking io"));
    }
    s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
-                          &block.value, rep->options.env, didIO);
+                          &block.value, rep->options.env);
  }
  Iterator* iter;
@ -919,13 +914,13 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
 class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
 public:
  BlockEntryIteratorState(BlockBasedTable* table,
-      const ReadOptions& read_options, bool* did_io)
+                          const ReadOptions& read_options)
-    : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr),
+      : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr),
-      table_(table), read_options_(read_options), did_io_(did_io) {}
+        table_(table),
        read_options_(read_options) {}
  Iterator* NewSecondaryIterator(const Slice& index_value) override {
-    return NewDataBlockIterator(table_->rep_, read_options_, did_io_,
+    return NewDataBlockIterator(table_->rep_, read_options_, index_value);
                                index_value);
  }
  bool PrefixMayMatch(const Slice& internal_key) override {
@ -936,8 +931,6 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
  // Don't own table_
  BlockBasedTable* table_;
  const ReadOptions read_options_;
  // Don't own did_io_
  bool* did_io_;
 };
 // This will be broken if the user specifies an unusual implementation
@ -1021,15 +1014,14 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
 Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options,
                                       Arena* arena) {
-  return NewTwoLevelIterator(
+  return NewTwoLevelIterator(new BlockEntryIteratorState(this, read_options),
-      new BlockEntryIteratorState(this, read_options, nullptr),
+                             NewIndexIterator(read_options), arena);
      NewIndexIterator(read_options), arena);
 }
 Status BlockBasedTable::Get(
    const ReadOptions& read_options, const Slice& key, void* handle_context,
    bool (*result_handler)(void* handle_context, const ParsedInternalKey& k,
-                           const Slice& v, bool didIO),
+                           const Slice& v),
    void (*mark_key_may_exist_handler)(void* handle_context)) {
  Status s;
  Iterator* iiter = NewIndexIterator(read_options);
@ -1052,9 +1044,8 @@ Status BlockBasedTable::Get(
      RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL);
      break;
    } else {
      bool didIO = false;
      unique_ptr<Iterator> block_iter(
-          NewDataBlockIterator(rep_, read_options, &didIO, iiter->value()));
+          NewDataBlockIterator(rep_, read_options, iiter->value()));
      if (read_options.read_tier && block_iter->status().IsIncomplete()) {
        // couldn't get block from block_cache
@ -1071,8 +1062,8 @@ Status BlockBasedTable::Get(
          s = Status::Corruption(Slice());
        }
-        if (!(*result_handler)(handle_context, parsed_key, block_iter->value(),
+        if (!(*result_handler)(handle_context, parsed_key,
-                               didIO)) {
+                               block_iter->value())) {
          done = true;
          break;
        }
@ -1089,22 +1080,34 @@ Status BlockBasedTable::Get(
  return s;
 }
 namespace {
 bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
               bool didIO) {
  *reinterpret_cast<bool*>(arg) = didIO;
  return false;
 }
 }  // namespace
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
-  // We use Get() as it has logic that checks whether we read the
+  std::unique_ptr<Iterator> iiter(NewIndexIterator(options));
-  // block from the disk or not.
+  iiter->Seek(key);
-  bool didIO = false;
+  assert(iiter->Valid());
-  Status s = Get(options, key, &didIO, SaveDidIO);
+  CachableEntry<Block> block;
  BlockHandle handle;
  Slice input = iiter->value();
  Status s = handle.DecodeFrom(&input);
  assert(s.ok());
-  return !didIO;
+  Cache* block_cache = rep_->options.block_cache.get();
  assert(block_cache != nullptr);
  char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  Slice cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
                  cache_key_storage);
  Slice ckey;
  s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr,
                            options, &block);
  assert(s.ok());
  bool in_cache = block.value != nullptr;
  if (in_cache) {
    ReleaseCachedEntry(block_cache, block.cache_handle);
  }
  return in_cache;
 }
 // REQUIRES: The following fields of rep_ should have already been populated:
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@ -73,8 +73,7 @@ class BlockBasedTable : public TableReader {
  Status Get(const ReadOptions& readOptions, const Slice& key,
             void* handle_context,
             bool (*result_handler)(void* handle_context,
-                                    const ParsedInternalKey& k, const Slice& v,
+                                    const ParsedInternalKey& k, const Slice& v),
                                    bool didIO),
             void (*mark_key_may_exist_handler)(void* handle_context) =
                 nullptr) override;
@ -87,7 +86,7 @@ class BlockBasedTable : public TableReader {
  uint64_t ApproximateOffsetOf(const Slice& key) override;
  // Returns true if the block for the specified key is in cache.
-  // REQUIRES: key is in this table.
+  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
  // Set up the table for Compaction. Might change some parameters with
@ -113,7 +112,7 @@ class BlockBasedTable : public TableReader {
  class BlockEntryIteratorState;
  static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
-      bool* didIO, const Slice& index_value);
+                                        const Slice& index_value);
  // For the following two functions:
  // if `no_io == true`, we will not try to read filter/index from sst file
--- a/table/plain_table_reader.cc
+++ b/table/plain_table_reader.cc
@ -643,7 +643,7 @@ void PlainTableReader::Prepare(const Slice& target) {
 Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
                             void* arg,
                             bool (*saver)(void*, const ParsedInternalKey&,
-                                           const Slice&, bool),
+                                           const Slice&),
                             void (*mark_key_may_exist)(void*)) {
  // Check bloom filter first.
  Slice prefix_slice;
@ -699,7 +699,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
      prefix_match = true;
    }
    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
-      if (!(*saver)(arg, found_key, found_value, true)) {
+      if (!(*saver)(arg, found_key, found_value)) {
        break;
      }
    }
--- a/table/plain_table_reader.h
+++ b/table/plain_table_reader.h
@ -63,7 +63,7 @@ class PlainTableReader: public TableReader {
  Status Get(const ReadOptions&, const Slice& key, void* arg,
             bool (*result_handler)(void* arg, const ParsedInternalKey& k,
-                                    const Slice& v, bool),
+                                    const Slice& v),
             void (*mark_key_may_exist)(void*) = nullptr);
  uint64_t ApproximateOffsetOf(const Slice& key);
--- a/table/table_reader.h
+++ b/table/table_reader.h
@ -55,8 +55,8 @@ class TableReader {
  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
  // the entry found after a call to Seek(key), until result_handler returns
  // false, where k is the actual internal key for a row found and v as the
-  // value of the key. didIO is true if I/O is involved in the operation. May
+  // value of the key. May not make such a call if filter policy says that key
-  // not make such a call if filter policy says that key is not present.
+  // is not present.
  //
  // mark_key_may_exist_handler needs to be called when it is configured to be
  // memory only and the key is not found in the block cache, with
@ -67,7 +67,7 @@ class TableReader {
  virtual Status Get(
      const ReadOptions& readOptions, const Slice& key, void* handle_context,
      bool (*result_handler)(void* arg, const ParsedInternalKey& k,
-                             const Slice& v, bool didIO),
+                             const Slice& v),
      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
 };
--- a/table/table_test.cc
+++ b/table/table_test.cc
@ -1481,6 +1481,14 @@ TEST(BlockBasedTableTest, BlockCacheLeak) {
  for (const std::string& key : keys) {
    ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
  }
  // rerun with different block cache
  opt.block_cache = NewLRUCache(16 * 1024 * 1024);
  ASSERT_OK(c.Reopen(opt));
  table_reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
  for (const std::string& key : keys) {
    ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key));
  }
 }
 TEST(PlainTableTest, BasicPlainTableProperties) {
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@ -81,7 +81,6 @@ def main(argv):
            --iterpercent=10
            --db=%s
            --max_key=100000000
            --disable_seek_compaction=%s
            --mmap_read=%s
            --block_size=16384
            --cache_size=1048576
@ -104,7 +103,6 @@ def main(argv):
                   write_buf_size,
                   dbname,
                   random.randint(0, 1),
                   random.randint(0, 1),
                   random.randint(0, 1)))
        child = subprocess.Popen([cmd],
--- a/tools/db_crashtest2.py
+++ b/tools/db_crashtest2.py
@ -100,7 +100,6 @@ def main(argv):
            --iterpercent=10
            --db=%s
            --max_key=100000000
            --disable_seek_compaction=%s
            --mmap_read=%s
            --block_size=16384
            --cache_size=1048576
@ -125,7 +124,6 @@ def main(argv):
                   dbname,
                   random.randint(0, 1),
                   random.randint(0, 1),
                   random.randint(0, 1),
                   additional_opts))
        print "Running:" + cmd + "\n"
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@ -286,9 +286,6 @@ DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
 static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
    RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
 DEFINE_bool(disable_seek_compaction, false,
            "Option to disable compation triggered by read.");
 namespace {
 enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
  assert(ctype);
@ -1585,7 +1582,6 @@ class StressTest {
        FLAGS_level0_file_num_compaction_trigger;
    options_.compression = FLAGS_compression_type_e;
    options_.create_if_missing = true;
    options_.disable_seek_compaction = FLAGS_disable_seek_compaction;
    options_.max_manifest_file_size = 10 * 1024;
    options_.filter_deletes = FLAGS_filter_deletes;
    if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
--- a/util/options.cc
+++ b/util/options.cc
@ -382,8 +382,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
        source_compaction_factor);
    Log(log,"         Options.max_grandparent_overlap_factor: %d",
        max_grandparent_overlap_factor);
    Log(log,"                Options.disable_seek_compaction: %d",
        disable_seek_compaction);
    Log(log,"                         Options.no_block_cache: %d",
        no_block_cache);
    Log(log,"                       Options.arena_block_size: %zu",
@ -466,7 +464,6 @@ Options::PrepareForBulkLoad()
  // no auto compactions please. The application should issue a
  // manual compaction after all data is loaded into L0.
  disable_auto_compactions = true;
  disable_seek_compaction = true;
  disableDataSync = true;
  // A manual compaction run should pick all files in L0 in