From d4a8423334092e9f1e5aff2f911f02d5ca599b5e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 20 Jun 2014 10:23:02 +0200 Subject: [PATCH] Remove seek compaction Summary: As discussed in our internal group, we don't get much use of seek compaction at the moment, while it's making code more complicated and slower in some cases. This diff removes seek compaction and (hopefully) all code that was introduced to support seek compaction. There is one test case that relied on didIO information. I'll try to find another way to implement it. Test Plan: make check Reviewers: sdong, haobo, yhchiang, ljin, dhruba Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19161 --- HISTORY.md | 9 ++++ build_tools/regression_build_test.sh | 8 --- db/column_family_test.cc | 41 --------------- db/compaction_picker.cc | 25 --------- db/db_bench.cc | 7 ++- db/db_impl.cc | 32 ++--------- db/db_impl_readonly.cc | 4 +- db/db_test.cc | 64 ---------------------- db/simple_table_db_test.cc | 6 +-- db/table_cache.cc | 17 +++--- db/table_cache.h | 6 +-- db/version_edit.h | 2 - db/version_set.cc | 71 ++----------------------- db/version_set.h | 18 +------ doc/index.html | 4 -- include/rocksdb/options.h | 13 +++-- table/block_based_table_reader.cc | 79 +++++++++++++++------------- table/block_based_table_reader.h | 7 ++- table/plain_table_reader.cc | 4 +- table/plain_table_reader.h | 2 +- table/table_reader.h | 6 +-- table/table_test.cc | 8 +++ tools/db_crashtest.py | 2 - tools/db_crashtest2.py | 2 - tools/db_stress.cc | 4 -- util/options.cc | 3 -- 26 files changed, 101 insertions(+), 343 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d0030cc4c..12f6b83dc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,14 @@ # Rocksdb Change Log +## Unreleased (will be released with 3.2.0) + +### Public API changes +* We removed seek compaction as a concept from RocksDB because: +1) It makes more sense for spinning disk workloads, while RocksDB is primarily 
designed for flash and memory, +2) It added some complexity to the important code-paths, +3) None of our internal customers were really using it. +Because of that, Options::disable_seek_compaction is now obsolete. It is still a parameter in Options, so it does not break the build, but it does not have any effect. We plan to completely remove it at some point, so we ask users to please remove this option from your code base. + ## 3.1.0 (05/21/2014) ### Public API changes diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index e2b19a3cb..5e335afde 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -109,7 +109,6 @@ make release --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ - --disable_seek_compaction=1 \ --statistics=1 \ --histogram=1 \ --disable_data_sync=1 \ @@ -129,7 +128,6 @@ make release --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ - --disable_seek_compaction=1 \ --use_tailing_iterator=1 \ --statistics=1 \ --histogram=1 \ @@ -150,7 +148,6 @@ make release --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ - --disable_seek_compaction=1 \ --statistics=1 \ --histogram=1 \ --disable_data_sync=1 \ @@ -172,7 +169,6 @@ make release --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ - --disable_seek_compaction=1 \ --statistics=1 \ --histogram=1 \ --disable_data_sync=1 \ @@ -231,7 +227,6 @@ make release --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ - --disable_seek_compaction=1 \ --disable_auto_compactions=1 \ --statistics=1 \ --histogram=1 \ @@ -254,7 +249,6 @@ make release --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ - --disable_seek_compaction=1 \ --statistics=1 \ --histogram=1 \ --disable_data_sync=1 \ @@ -274,7 +268,6 @@ make release --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ 
--open_files=55000 \ - --disable_seek_compaction=1 \ --statistics=1 \ --histogram=1 \ --disable_data_sync=1 \ @@ -291,7 +284,6 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --value_size=100 \ --compression_type=none \ --compression_ratio=1 \ - --disable_seek_compaction=1 \ --hard_rate_limit=2 \ --write_buffer_size=134217728 \ --max_write_buffer_number=4 \ diff --git a/db/column_family_test.cc b/db/column_family_test.cc index d8183759e..0dc9ccd09 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -749,7 +749,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) { default_cf.filter_policy = nullptr; default_cf.no_block_cache = true; default_cf.source_compaction_factor = 100; - default_cf.disable_seek_compaction = false; one.compaction_style = kCompactionStyleUniversal; // trigger compaction if there are >= 4 files @@ -764,33 +763,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) { Reopen({default_cf, one, two}); - // SETUP column family "default" - test read compaction - ASSERT_EQ("", FilesPerLevel(0)); - PutRandomData(0, 1, 4096); - ASSERT_OK(Flush(0)); - ASSERT_EQ("0,0,1", FilesPerLevel(0)); - // write 8MB - PutRandomData(0, 2000, 4096); - ASSERT_OK(Flush(0)); - // clear levels 0 and 1 - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]); - ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); - // write some new keys into level 0 and 1 - PutRandomData(0, 1024, 512); - ASSERT_OK(Flush(0)); - WaitForCompaction(); - PutRandomData(0, 10, 512); - ASSERT_OK(Flush(0)); - // remember number of files in each level - int l1 = NumTableFilesAtLevel(0, 0); - int l2 = NumTableFilesAtLevel(1, 0); - int l3 = NumTableFilesAtLevel(2, 0); - ASSERT_NE(l1, 0); - ASSERT_NE(l2, 0); - ASSERT_NE(l3, 0); - // SETUP column family "one" -- universal style for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) { PutRandomData(1, 11, 10000); @@ 
-805,12 +777,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) { ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2)); } - // TRIGGER compaction "default" - // read a bunch of times, trigger read compaction - for (int i = 0; i < 200000; ++i) { - Get(0, std::to_string(i)); - } - // TRIGGER compaction "one" PutRandomData(1, 12, 10000); @@ -820,13 +786,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) { // WAIT for compactions WaitForCompaction(); - // VERIFY compaction "default" - // verify that the number of files have decreased - // in some level, indicating that there was a compaction - ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 || - NumTableFilesAtLevel(1, 0) < l2 || - NumTableFilesAtLevel(2, 0) < l3); - // VERIFY compaction "one" ASSERT_EQ("1", FilesPerLevel(1)); diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 2ec42b49a..b6e407e63 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -407,31 +407,6 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, } } - // Find compactions needed by seeks - FileMetaData* f = version->file_to_compact_; - if (c == nullptr && f != nullptr && !f->being_compacted) { - - level = version->file_to_compact_level_; - int parent_index = -1; - - // Only allow one level 0 compaction at a time. - // Do not pick this file if its parents at level+1 are being compacted. 
- if (level != 0 || compactions_in_progress_[0].empty()) { - if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level, - &parent_index)) { - c = new Compaction(version, level, level + 1, - MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), true); - c->inputs_[0].push_back(f); - c->parent_index_ = parent_index; - c->input_version_->file_to_compact_ = nullptr; - if (ExpandWhileOverlapping(c) == false) { - return nullptr; - } - } - } - } - if (c == nullptr) { return nullptr; } diff --git a/db/db_bench.cc b/db/db_bench.cc index b8e4b3213..6e61ce0a8 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -333,9 +333,6 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" "deletepercent), so deletepercent must be smaller than (100 - " "FLAGS_readwritepercent)"); -DEFINE_int32(disable_seek_compaction, false, "Option to disable compaction" - " triggered by read."); - DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete " "obsolete files periodically. 
0 means that obsolete files are" " deleted after every compaction run."); @@ -553,6 +550,9 @@ static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) = static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent); +DEFINE_int32(disable_seek_compaction, false, + "Not used, left here for backwards compatibility"); + static const bool FLAGS_deletepercent_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent); static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) = @@ -1663,7 +1663,6 @@ class Benchmark { options.compression_per_level[i] = FLAGS_compression_type_e; } } - options.disable_seek_compaction = FLAGS_disable_seek_compaction; options.delete_obsolete_files_period_micros = FLAGS_delete_obsolete_files_period_micros; options.soft_rate_limit = FLAGS_soft_rate_limit; diff --git a/db/db_impl.cc b/db/db_impl.cc index 290b67f16..0fb8271bc 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3276,9 +3276,6 @@ Status DBImpl::GetImpl(const ReadOptions& options, mutex_.Unlock(); } - bool have_stat_update = false; - Version::GetStats stats; - // Prepare to store a list of merge operations if merge occurs. 
MergeContext merge_context; @@ -3297,23 +3294,13 @@ Status DBImpl::GetImpl(const ReadOptions& options, } else { PERF_TIMER_START(get_from_output_files_time); - sv->current->Get(options, lkey, value, &s, &merge_context, &stats, - value_found); - have_stat_update = true; + sv->current->Get(options, lkey, value, &s, &merge_context, value_found); PERF_TIMER_STOP(get_from_output_files_time); RecordTick(options_.statistics.get(), MEMTABLE_MISS); } PERF_TIMER_START(get_post_process_time); - if (!cfd->options()->disable_seek_compaction && have_stat_update) { - mutex_.Lock(); - if (sv->current->UpdateStats(stats)) { - MaybeScheduleFlushOrCompaction(); - } - mutex_.Unlock(); - } - bool unref_sv = true; if (LIKELY(options_.allow_thread_local)) { unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); @@ -3350,8 +3337,6 @@ std::vector DBImpl::MultiGet( struct MultiGetColumnFamilyData { ColumnFamilyData* cfd; SuperVersion* super_version; - Version::GetStats stats; - bool have_stat_update = false; }; std::unordered_map multiget_cf_data; // fill up and allocate outside of mutex @@ -3412,9 +3397,7 @@ std::vector DBImpl::MultiGet( *cfd->options())) { // Done } else { - super_version->current->Get(options, lkey, value, &s, &merge_context, - &mgd->stats); - mgd->have_stat_update = true; + super_version->current->Get(options, lkey, value, &s, &merge_context); } if (s.ok()) { @@ -3426,24 +3409,15 @@ std::vector DBImpl::MultiGet( PERF_TIMER_START(get_post_process_time); autovector superversions_to_delete; - bool schedule_flush_or_compaction = false; + // TODO(icanadi) do we need lock here or just around Cleanup()? 
mutex_.Lock(); for (auto mgd_iter : multiget_cf_data) { auto mgd = mgd_iter.second; - auto cfd = mgd->cfd; - if (!cfd->options()->disable_seek_compaction && mgd->have_stat_update) { - if (mgd->super_version->current->UpdateStats(mgd->stats)) { - schedule_flush_or_compaction = true; - } - } if (mgd->super_version->Unref()) { mgd->super_version->Cleanup(); superversions_to_delete.push_back(mgd->super_version); } } - if (schedule_flush_or_compaction) { - MaybeScheduleFlushOrCompaction(); - } mutex_.Unlock(); for (auto td : superversions_to_delete) { diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 43083746d..b1324d072 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -65,9 +65,7 @@ Status DBImplReadOnly::Get(const ReadOptions& options, if (super_version->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { } else { - Version::GetStats stats; - super_version->current->Get(options, lkey, value, &s, &merge_context, - &stats); + super_version->current->Get(options, lkey, value, &s, &merge_context); } return s; } diff --git a/db/db_test.cc b/db/db_test.cc index c998cca87..ab559b53a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5869,69 +5869,6 @@ TEST(DBTest, ReadFirstRecordCache) { ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); } -TEST(DBTest, ReadCompaction) { - std::string value(4096, '4'); // a string of size 4K - { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.max_open_files = 20; // only 10 file in file-cache - options.target_file_size_base = 512; - options.write_buffer_size = 64 * 1024; - options.filter_policy = nullptr; - options.block_size = 4096; - options.no_block_cache = true; - options.disable_seek_compaction = false; - - CreateAndReopenWithCF({"pikachu"}, &options); - - // Write 8MB (2000 values, each 4K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - std::vector values; - for (int i = 0; i < 2000; i++) { - ASSERT_OK(Put(1, Key(i), value)); - } - - // clear level 0 and 
1 if necessary. - Flush(1); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); - - // write some new keys into level 0 - for (int i = 0; i < 2000; i = i + 16) { - ASSERT_OK(Put(1, Key(i), value)); - } - Flush(1); - - // Wait for any write compaction to finish - dbfull()->TEST_WaitForCompact(); - - // remember number of files in each level - int l1 = NumTableFilesAtLevel(0, 1); - int l2 = NumTableFilesAtLevel(1, 1); - int l3 = NumTableFilesAtLevel(2, 1); - ASSERT_NE(NumTableFilesAtLevel(0, 1), 0); - ASSERT_NE(NumTableFilesAtLevel(1, 1), 0); - ASSERT_NE(NumTableFilesAtLevel(2, 1), 0); - - // read a bunch of times, trigger read compaction - for (int j = 0; j < 100; j++) { - for (int i = 0; i < 2000; i++) { - Get(1, Key(i)); - } - } - // wait for read compaction to finish - env_->SleepForMicroseconds(1000000); - - // verify that the number of files have decreased - // in some level, indicating that there was a compaction - ASSERT_TRUE(NumTableFilesAtLevel(0, 1) < l1 || - NumTableFilesAtLevel(1, 1) < l2 || - NumTableFilesAtLevel(2, 1) < l3); - } -} - // Multi-threaded test: namespace { @@ -6641,7 +6578,6 @@ TEST(DBTest, PrefixScan) { options.disable_auto_compactions = true; options.max_background_compactions = 2; options.create_if_missing = true; - options.disable_seek_compaction = true; options.memtable_factory.reset(NewHashSkipListRepFactory()); // 11 RAND I/Os diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index a86ff0a17..1d616c226 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -87,7 +87,7 @@ public: Status Get(const ReadOptions&, const Slice& key, void* arg, bool (*handle_result)(void* arg, const ParsedInternalKey& k, - const Slice& v, bool), + const Slice& v), void (*mark_key_may_exist)(void*) = nullptr) override; uint64_t ApproximateOffsetOf(const 
Slice& key) override; @@ -285,7 +285,7 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const ParsedInternalKey&, - const Slice&, bool), + const Slice&), void (*mark_key_may_exist)(void*)) { Status s; SimpleTableIterator* iter = new SimpleTableIterator(this); @@ -295,7 +295,7 @@ Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, return Status::Corruption(Slice()); } - if (!(*saver)(arg, parsed_key, iter->value(), true)) { + if (!(*saver)(arg, parsed_key, iter->value())) { break; } } diff --git a/db/table_cache.cc b/db/table_cache.cc index 9993e5a08..7a7513026 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -58,7 +58,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { Status TableCache::FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, - bool* table_io, const bool no_io) { + const bool no_io) { Status s; Slice key = GetSliceForFileNumber(&fd.number); *handle = cache_->Lookup(key); @@ -66,9 +66,6 @@ Status TableCache::FindTable(const EnvOptions& toptions, if (no_io) { // Dont do IO and return a not-found status return Status::Incomplete("Table not found in table_cache, no_io is set"); } - if (table_io != nullptr) { - *table_io = true; // we had to do IO from storage - } std::string fname = TableFileName(dbname_, fd.GetNumber()); unique_ptr file; unique_ptr table_reader; @@ -110,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Cache::Handle* handle = nullptr; Status s; if (table_reader == nullptr) { - s = FindTable(toptions, icomparator, fd, &handle, nullptr, + s = FindTable(toptions, icomparator, fd, &handle, options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s, arena); @@ -137,13 +134,13 @@ Status TableCache::Get(const ReadOptions& options, const 
InternalKeyComparator& internal_comparator, const FileDescriptor& fd, const Slice& k, void* arg, bool (*saver)(void*, const ParsedInternalKey&, - const Slice&, bool), - bool* table_io, void (*mark_key_may_exist)(void*)) { + const Slice&), + void (*mark_key_may_exist)(void*)) { TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; if (!t) { - s = FindTable(storage_options_, internal_comparator, fd, &handle, table_io, + s = FindTable(storage_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier); if (s.ok()) { t = GetTableReaderFromHandle(handle); @@ -174,10 +171,8 @@ Status TableCache::GetTableProperties( return s; } - bool table_io; Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, &table_io, - no_io); + s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io); if (!s.ok()) { return s; } diff --git a/db/table_cache.h b/db/table_cache.h index 4311a390d..e912addc1 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -52,8 +52,8 @@ class TableCache { const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, const Slice& k, void* arg, bool (*handle_result)(void*, const ParsedInternalKey&, - const Slice&, bool), - bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); + const Slice&), + void (*mark_key_may_exist)(void*) = nullptr); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); @@ -62,7 +62,7 @@ class TableCache { Status FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, Cache::Handle**, - bool* table_io = nullptr, const bool no_io = false); + const bool no_io = false); // Get TableReader from a cache handle. 
TableReader* GetTableReaderFromHandle(Cache::Handle* handle); diff --git a/db/version_edit.h b/db/version_edit.h index 7d4d28b03..1d214a149 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -40,7 +40,6 @@ struct FileDescriptor { struct FileMetaData { int refs; FileDescriptor fd; - int allowed_seeks; // Seeks allowed until compaction InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table bool being_compacted; // Is this file undergoing compaction? @@ -53,7 +52,6 @@ struct FileMetaData { FileMetaData() : refs(0), fd(0, 0), - allowed_seeks(1 << 30), being_compacted(false), table_reader_handle(nullptr) {} }; diff --git a/db/version_set.cc b/db/version_set.cc index 831e8941a..10b25533c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -362,7 +362,6 @@ struct Saver { // the merge operations encountered; MergeContext* merge_context; Logger* logger; - bool didIO; // did we do any disk io? Statistics* statistics; }; } @@ -381,15 +380,14 @@ static void MarkKeyMayExist(void* arg) { } static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v, bool didIO) { + const Slice& v) { Saver* s = reinterpret_cast(arg); MergeContext* merge_contex = s->merge_context; std::string merge_result; // temporary area for merge results later assert(s != nullptr && merge_contex != nullptr); - // TODO: didIO and Merge? - s->didIO = didIO; + // TODO: Merge? if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { // Key matches. 
Process it switch (parsed_key.type) { @@ -490,8 +488,6 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, files_(new std::vector[num_levels_]), files_by_size_(num_levels_), next_file_to_compact_by_size_(num_levels_), - file_to_compact_(nullptr), - file_to_compact_level_(-1), compaction_score_(num_levels_), compaction_level_(num_levels_), version_number_(version_number), @@ -504,7 +500,6 @@ void Version::Get(const ReadOptions& options, std::string* value, Status* status, MergeContext* merge_context, - GetStats* stats, bool* value_found) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -519,14 +514,8 @@ void Version::Get(const ReadOptions& options, saver.merge_operator = merge_operator_; saver.merge_context = merge_context; saver.logger = info_log_; - saver.didIO = false; saver.statistics = db_statistics_; - stats->seek_file = nullptr; - stats->seek_file_level = -1; - FileMetaData* last_file_read = nullptr; - int last_file_read_level = -1; - // We can search level-by-level since entries never hop across // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we @@ -657,32 +646,13 @@ void Version::Get(const ReadOptions& options, } prev_file = f; #endif - bool tableIO = false; *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey, - &saver, SaveValue, &tableIO, MarkKeyMayExist); + &saver, SaveValue, MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; } - if (last_file_read != nullptr && stats->seek_file == nullptr) { - // We have had more than one seek for this read. Charge the 1st file. - stats->seek_file = last_file_read; - stats->seek_file_level = last_file_read_level; - } - - // If we did any IO as part of the read, then we remember it because - // it is a possible candidate for seek-based compaction. 
saver.didIO - // is true if the block had to be read in from storage and was not - // pre-exisiting in the block cache. Also, if this file was not pre- - // existing in the table cache and had to be freshly opened that needed - // the index blocks to be read-in, then tableIO is true. One thing - // to note is that the index blocks are not part of the block cache. - if (saver.didIO || tableIO) { - last_file_read = f; - last_file_read_level = level; - } - switch (saver.state) { case kNotFound: break; // Keep searching in other files @@ -723,19 +693,6 @@ void Version::Get(const ReadOptions& options, } } -bool Version::UpdateStats(const GetStats& stats) { - FileMetaData* f = stats.seek_file; - if (f != nullptr) { - f->allowed_seeks--; - if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) { - file_to_compact_ = f; - file_to_compact_level_ = stats.seek_file_level; - return true; - } - } - return false; -} - void Version::PrepareApply(std::vector& size_being_compacted) { ComputeCompactionScore(size_being_compacted); UpdateFilesBySize(); @@ -917,9 +874,6 @@ bool Version::Unref() { } bool Version::NeedsCompaction() const { - if (file_to_compact_ != nullptr) { - return true; - } // In universal compaction case, this check doesn't really // check the compaction condition, but checks num of files threshold // only. We are not going to miss any compaction opportunity @@ -1477,22 +1431,6 @@ class VersionSet::Builder { FileMetaData* f = new FileMetaData(new_file.second); f->refs = 1; - // We arrange to automatically compact this file after - // a certain number of seeks. Let's assume: - // (1) One seek costs 10ms - // (2) Writing or reading 1MB costs 10ms (100MB/s) - // (3) A compaction of 1MB does 25MB of IO: - // 1MB read from this level - // 10-12MB read from next level (boundaries may be misaligned) - // 10-12MB written to next level - // This implies that 25 seeks cost the same as the compaction - // of 1MB of data. 
I.e., one seek costs approximately the - // same as the compaction of 40KB of data. We are a little - // conservative and allow approximately one seek for every 16KB - // of data before triggering a compaction. - f->allowed_seeks = (f->fd.GetFileSize() / 16384); - if (f->allowed_seeks < 100) f->allowed_seeks = 100; - levels_[level].deleted_files.erase(f->fd.GetNumber()); levels_[level].added_files->insert(f); } @@ -1539,10 +1477,9 @@ class VersionSet::Builder { for (int level = 0; level < cfd_->NumberLevels(); level++) { for (auto& file_meta : *(levels_[level].added_files)) { assert (!file_meta->table_reader_handle); - bool table_io; cfd_->table_cache()->FindTable( base_->vset_->storage_options_, cfd_->internal_comparator(), - file_meta->fd, &file_meta->table_reader_handle, &table_io, false); + file_meta->fd, &file_meta->table_reader_handle, false); if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = diff --git a/db/version_set.h b/db/version_set.h index 07dca8b9d..cf526c2bd 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -85,23 +85,13 @@ class Version { MergeIteratorBuilder* merger_iter_builder); // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. Fills *stats. + // return OK. Else return a non-OK status. // Uses *operands to store merge_operator operations to apply later // REQUIRES: lock is not held - struct GetStats { - FileMetaData* seek_file; - int seek_file_level; - }; - void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, MergeContext* merge_context, GetStats* stats, + Status* status, MergeContext* merge_context, bool* value_found = nullptr); - // Adds "stats" into the current state. Returns true if a new - // compaction may need to be triggered, false otherwise. 
- // REQUIRES: lock is held - bool UpdateStats(const GetStats& stats); - // Updates internal structures that keep track of compaction scores // We use compaction scores to figure out which compaction to do next // REQUIRES: If Version is not yet saved to current_, it can be called without @@ -278,10 +268,6 @@ class Version { // seconds/minutes (because of concurrent compactions). static const int number_of_files_to_sort_ = 50; - // Next file to compact based on seek stats. - FileMetaData* file_to_compact_; - int file_to_compact_level_; - // Level that should be compacted next and its compaction score. // Score < 1 means compaction is not strictly needed. These fields // are initialized by Finalize(). diff --git a/doc/index.html b/doc/index.html index 71f515e76..94f7cb888 100644 --- a/doc/index.html +++ b/doc/index.html @@ -642,10 +642,6 @@ Default:1, i.e. pick maxfilesize amount of data as the source of a compaction.
  • Options::max_grandparent_overlap_factor - Control maximum bytes of overlaps in grandparent (i.e., level+2) before we stop building a single file in a level->level+1 compaction.

    -

  • Options::disable_seek_compaction - Disable compaction triggered by seek. -With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache -(which is true if max_open_files is large). -

  • Options::max_background_compactions - Maximum number of concurrent background jobs, submitted to the default LOW priority thread pool diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index d32e95aef..658be3afc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -368,10 +368,15 @@ struct ColumnFamilyOptions { // stop building a single file in a level->level+1 compaction. int max_grandparent_overlap_factor; - // Disable compaction triggered by seek. - // With bloomfilter and fast storage, a miss on one level - // is very cheap if the file handle is cached in table cache - // (which is true if max_open_files is large). + // We decided to remove seek compaction from RocksDB because: + // 1) It makes more sense for spinning disk workloads, while RocksDB is + // primarily designed for flash and memory, + // 2) It added some complexity to the important code-paths, + // 3) None of our internal customers were really using it. + // + // Since we removed seek compaction, this option is now obsolete. + // We left it here for backwards compatibility (otherwise it would break the + // build), but we'll remove it at some point. // Default: true bool disable_seek_compaction; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 25de3ca63..81e8a711e 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -56,13 +56,11 @@ const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) = // Read the block identified by "handle" from "file". // The only relevant option is options.verify_checksums for now. -// Set *didIO to true if didIO is not null. // On failure return non-OK. 
// On success fill *result and return OK - caller owns *result Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - Block** result, Env* env, bool* didIO = nullptr, - bool do_uncompress = true) { + Block** result, Env* env, bool do_uncompress = true) { BlockContents contents; Status s = ReadBlockContents(file, footer, options, handle, &contents, env, do_uncompress); @@ -70,9 +68,6 @@ Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, *result = new Block(contents); } - if (didIO != nullptr) { - *didIO = true; - } return s; } @@ -834,7 +829,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) { // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, - const ReadOptions& ro, bool* didIO, const Slice& index_value) { + const ReadOptions& ro, const Slice& index_value) { const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep->options.block_cache.get(); Cache* block_cache_compressed = rep->options. 
@@ -872,7 +867,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - statistics, ro, &block); + statistics, ro, &block); if (block.value == nullptr && !no_io && ro.fill_cache) { Histograms histogram = READ_BLOCK_GET_MICROS; @@ -880,7 +875,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, { StopWatch sw(rep->options.env, statistics, histogram); s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &raw_block, rep->options.env, didIO, + &raw_block, rep->options.env, block_cache_compressed == nullptr); } @@ -898,7 +893,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, return NewErrorIterator(Status::Incomplete("no blocking io")); } s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &block.value, rep->options.env, didIO); + &block.value, rep->options.env); } Iterator* iter; @@ -919,13 +914,13 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState(BlockBasedTable* table, - const ReadOptions& read_options, bool* did_io) - : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), - table_(table), read_options_(read_options), did_io_(did_io) {} + const ReadOptions& read_options) + : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), + table_(table), + read_options_(read_options) {} Iterator* NewSecondaryIterator(const Slice& index_value) override { - return NewDataBlockIterator(table_->rep_, read_options_, did_io_, - index_value); + return NewDataBlockIterator(table_->rep_, read_options_, index_value); } bool PrefixMayMatch(const Slice& internal_key) override { @@ -936,8 +931,6 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { // Don't own table_ BlockBasedTable* table_; const ReadOptions read_options_; - // Don't own did_io_ - bool* did_io_; }; // 
This will be broken if the user specifies an unusual implementation @@ -1021,15 +1014,14 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, Arena* arena) { - return NewTwoLevelIterator( - new BlockEntryIteratorState(this, read_options, nullptr), - NewIndexIterator(read_options), arena); + return NewTwoLevelIterator(new BlockEntryIteratorState(this, read_options), + NewIndexIterator(read_options), arena); } Status BlockBasedTable::Get( const ReadOptions& read_options, const Slice& key, void* handle_context, bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, - const Slice& v, bool didIO), + const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context)) { Status s; Iterator* iiter = NewIndexIterator(read_options); @@ -1052,9 +1044,8 @@ Status BlockBasedTable::Get( RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); break; } else { - bool didIO = false; unique_ptr block_iter( - NewDataBlockIterator(rep_, read_options, &didIO, iiter->value())); + NewDataBlockIterator(rep_, read_options, iiter->value())); if (read_options.read_tier && block_iter->status().IsIncomplete()) { // couldn't get block from block_cache @@ -1071,8 +1062,8 @@ Status BlockBasedTable::Get( s = Status::Corruption(Slice()); } - if (!(*result_handler)(handle_context, parsed_key, block_iter->value(), - didIO)) { + if (!(*result_handler)(handle_context, parsed_key, + block_iter->value())) { done = true; break; } @@ -1089,22 +1080,34 @@ Status BlockBasedTable::Get( return s; } -namespace { -bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value, - bool didIO) { - *reinterpret_cast(arg) = didIO; - return false; -} -} // namespace - bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - // We use Get() as it has logic that checks whether we read the - // block from the disk or not. 
- bool didIO = false; - Status s = Get(options, key, &didIO, SaveDidIO); + std::unique_ptr iiter(NewIndexIterator(options)); + iiter->Seek(key); + assert(iiter->Valid()); + CachableEntry block; + + BlockHandle handle; + Slice input = iiter->value(); + Status s = handle.DecodeFrom(&input); assert(s.ok()); - return !didIO; + Cache* block_cache = rep_->options.block_cache.get(); + assert(block_cache != nullptr); + + char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice cache_key = + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, + cache_key_storage); + Slice ckey; + + s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, + options, &block); + assert(s.ok()); + bool in_cache = block.value != nullptr; + if (in_cache) { + ReleaseCachedEntry(block_cache, block.cache_handle); + } + return in_cache; } // REQUIRES: The following fields of rep_ should have already been populated: diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index ba6a10c3e..072bedf3b 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -73,8 +73,7 @@ class BlockBasedTable : public TableReader { Status Get(const ReadOptions& readOptions, const Slice& key, void* handle_context, bool (*result_handler)(void* handle_context, - const ParsedInternalKey& k, const Slice& v, - bool didIO), + const ParsedInternalKey& k, const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) override; @@ -87,7 +86,7 @@ class BlockBasedTable : public TableReader { uint64_t ApproximateOffsetOf(const Slice& key) override; // Returns true if the block for the specified key is in cache. - // REQUIRES: key is in this table. + // REQUIRES: key is in this table && block cache enabled bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); // Set up the table for Compaction. 
Might change some parameters with @@ -113,7 +112,7 @@ class BlockBasedTable : public TableReader { class BlockEntryIteratorState; static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, - bool* didIO, const Slice& index_value); + const Slice& index_value); // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index b3223805f..469a61cf4 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -643,7 +643,7 @@ void PlainTableReader::Prepare(const Slice& target) { Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, void* arg, bool (*saver)(void*, const ParsedInternalKey&, - const Slice&, bool), + const Slice&), void (*mark_key_may_exist)(void*)) { // Check bloom filter first. Slice prefix_slice; @@ -699,7 +699,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, prefix_match = true; } if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { - if (!(*saver)(arg, found_key, found_value, true)) { + if (!(*saver)(arg, found_key, found_value)) { break; } } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 8e5a3ee52..6759966c1 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -63,7 +63,7 @@ class PlainTableReader: public TableReader { Status Get(const ReadOptions&, const Slice& key, void* arg, bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v, bool), + const Slice& v), void (*mark_key_may_exist)(void*) = nullptr); uint64_t ApproximateOffsetOf(const Slice& key); diff --git a/table/table_reader.h b/table/table_reader.h index dec0cac3f..3f1391808 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -55,8 +55,8 @@ class TableReader { // Calls (*result_handler)(handle_context, ...) 
repeatedly, starting with // the entry found after a call to Seek(key), until result_handler returns // false, where k is the actual internal key for a row found and v as the - // value of the key. didIO is true if I/O is involved in the operation. May - // not make such a call if filter policy says that key is not present. + // value of the key. May not make such a call if filter policy says that key + // is not present. // // mark_key_may_exist_handler needs to be called when it is configured to be // memory only and the key is not found in the block cache, with @@ -67,7 +67,7 @@ class TableReader { virtual Status Get( const ReadOptions& readOptions, const Slice& key, void* handle_context, bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v, bool didIO), + const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; }; diff --git a/table/table_test.cc b/table/table_test.cc index a03c7390b..d2a71a79e 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1481,6 +1481,14 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { for (const std::string& key : keys) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); } + + // rerun with different block cache + opt.block_cache = NewLRUCache(16 * 1024 * 1024); + ASSERT_OK(c.Reopen(opt)); + table_reader = dynamic_cast(c.table_reader()); + for (const std::string& key : keys) { + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); + } } TEST(PlainTableTest, BasicPlainTableProperties) { diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 3c93eca36..8d0b4f5f7 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -81,7 +81,6 @@ def main(argv): --iterpercent=10 --db=%s --max_key=100000000 - --disable_seek_compaction=%s --mmap_read=%s --block_size=16384 --cache_size=1048576 @@ -104,7 +103,6 @@ def main(argv): write_buf_size, dbname, random.randint(0, 1), - random.randint(0, 1), random.randint(0, 1))) child = 
subprocess.Popen([cmd], diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py index 3ef383afc..bca46b853 100644 --- a/tools/db_crashtest2.py +++ b/tools/db_crashtest2.py @@ -100,7 +100,6 @@ def main(argv): --iterpercent=10 --db=%s --max_key=100000000 - --disable_seek_compaction=%s --mmap_read=%s --block_size=16384 --cache_size=1048576 @@ -125,7 +124,6 @@ def main(argv): dbname, random.randint(0, 1), random.randint(0, 1), - random.randint(0, 1), additional_opts)) print "Running:" + cmd + "\n" diff --git a/tools/db_stress.cc b/tools/db_stress.cc index e8f3dee71..929efee3f 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -286,9 +286,6 @@ DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); static const bool FLAGS_num_iterations_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); -DEFINE_bool(disable_seek_compaction, false, - "Option to disable compation triggered by read."); - namespace { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -1585,7 +1582,6 @@ class StressTest { FLAGS_level0_file_num_compaction_trigger; options_.compression = FLAGS_compression_type_e; options_.create_if_missing = true; - options_.disable_seek_compaction = FLAGS_disable_seek_compaction; options_.max_manifest_file_size = 10 * 1024; options_.filter_deletes = FLAGS_filter_deletes; if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { diff --git a/util/options.cc b/util/options.cc index 8fcc73a4e..ee20e78b9 100644 --- a/util/options.cc +++ b/util/options.cc @@ -382,8 +382,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { source_compaction_factor); Log(log," Options.max_grandparent_overlap_factor: %d", max_grandparent_overlap_factor); - Log(log," Options.disable_seek_compaction: %d", - disable_seek_compaction); Log(log," Options.no_block_cache: %d", no_block_cache); Log(log," Options.arena_block_size: %zu", @@ -466,7 +464,6 @@ 
Options::PrepareForBulkLoad() // no auto compactions please. The application should issue a // manual compaction after all data is loaded into L0. disable_auto_compactions = true; - disable_seek_compaction = true; disableDataSync = true; // A manual compaction run should pick all files in L0 in