From c2404d9928e28b064c0a2e72cfdfc7a369678c12 Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Fri, 16 Aug 2019 14:16:49 -0700 Subject: [PATCH] Optimizing ApproximateSize to create index iterator just once (#5693) Summary: VersionSet::ApproximateSize doesn't need to create two separate index iterators and do binary search for each in BlockBasedTable. So BlockBasedTable::ApproximateSize was added that creates the iterator once and uses it to calculate the data size between start and end keys. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5693 Differential Revision: D16774056 Pulled By: elipoz fbshipit-source-id: 53ce262e1a057788243bf30cd9b8aa6581df1a18 --- db/table_cache.cc | 27 ++++++++ db/table_cache.h | 7 +++ db/version_set.cc | 55 +++++++++++++--- db/version_set.h | 9 ++- table/block_based/block_based_table_reader.cc | 62 ++++++++++++++----- table/block_based/block_based_table_reader.h | 13 +++- table/cuckoo/cuckoo_table_reader.h | 6 ++ table/mock_table.h | 5 ++ table/plain/plain_table_reader.cc | 6 ++ table/plain/plain_table_reader.h | 3 + table/table_reader.h | 6 ++ 11 files changed, 170 insertions(+), 29 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index 3c8a36c3b..f4de3b8fb 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -553,4 +553,31 @@ uint64_t TableCache::ApproximateOffsetOf( return result; } + +uint64_t TableCache::ApproximateSize( + const Slice& start, const Slice& end, const FileDescriptor& fd, + TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = fd.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable(env_options_, internal_comparator, fd, &table_handle, + prefix_extractor, false /* no_io */, + !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = GetTableReaderFromHandle(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateSize(start, end, caller); + } + if (table_handle != nullptr) { + ReleaseHandle(table_handle); + } + + return result; +} } // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h index ff9a70b57..f7e0b0b35 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -167,6 +167,13 @@ class TableCache { const InternalKeyComparator& internal_comparator, const SliceTransform* prefix_extractor = nullptr); + // Returns approximated data size between start and end keys in a file + // represented by fd (the start key must not be greater than the end key). + uint64_t ApproximateSize(const Slice& start, const Slice& end, + const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor = nullptr); + // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); diff --git a/db/version_set.cc b/db/version_set.cc index d1216646a..e94ad3d5a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5011,7 +5011,7 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, uint64_t file_size = files_brief.files[i].fd.GetFileSize(); // The entire file falls into the range, so we can just take its size. assert(file_size == - ApproximateSize(v, files_brief.files[i], end, caller)); + ApproximateSize(v, files_brief.files[i], start, end, caller)); total_full_size += file_size; } @@ -5045,25 +5045,23 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, } else { // Estimate for all the first files, at each level for (const auto file_ptr : first_files) { - total_full_size += ApproximateSize(v, *file_ptr, end, caller); - // subtract the bytes needed to be scanned to get to the starting key - uint64_t val = ApproximateSize(v, *file_ptr, start, caller); - assert(total_full_size >= val); - total_full_size -= val; + total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); } // Estimate for all the last files, at each level for (const auto file_ptr : last_files) { - total_full_size += ApproximateSize(v, *file_ptr, end, caller); + // We could use ApproximateSize here, but calling ApproximateOffsetOf + // directly is just more efficient. + total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); } } return total_full_size; } -uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key, - TableReaderCaller caller) { +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, + TableReaderCaller caller) { // pre-condition assert(v); const auto& icmp = v->cfd_->internal_comparator(); @@ -5088,6 +5086,43 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, return result; } +uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, + const Slice& start, const Slice& end, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + assert(icmp.Compare(start, end) <= 0); + + if (icmp.Compare(f.largest_key, start) <= 0 || + icmp.Compare(f.smallest_key, end) > 0) { + // Entire file is before or after the start/end keys range + return 0; + } + + if (icmp.Compare(f.smallest_key, start) >= 0) { + // Start of the range is before the file start - approximate by end offset + return ApproximateOffsetOf(v, f, end, caller); + } + + if (icmp.Compare(f.largest_key, end) < 0) { + // End of the range is after the file end - approximate by subtracting + // start offset from the file size + uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + assert(f.fd.GetFileSize() >= start_offset); + return f.fd.GetFileSize() - start_offset; + } + + // The interval falls entirely in the range for this file. + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache == nullptr) { + return 0; + } + return table_cache->ApproximateSize( + start, end, f.file_metadata->fd, caller, icmp, + v->GetMutableCFOptions().prefix_extractor.get()); +} + void VersionSet::AddLiveFiles(std::vector* live_list) { // pre-calculate space requirement int64_t total_files = 0; diff --git a/db/version_set.h b/db/version_set.h index 3b4d5661c..028c6ad1b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1043,8 +1043,15 @@ class VersionSet { } }; + // Returns approximated offset of a key in a file for a given version. + uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, TableReaderCaller caller); + + // Returns approximated data size between start and end keys in a file + // for a given version. uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key, TableReaderCaller caller); + const Slice& start, const Slice& end, + TableReaderCaller caller); // Save current contents to *log Status WriteSnapshot(log::Writer* log); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index bb12188d3..119de00f0 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -3921,25 +3921,16 @@ Status BlockBasedTable::CreateIndexReader( } } -uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, - TableReaderCaller caller) { - BlockCacheLookupContext context(caller); - IndexBlockIter iiter_on_stack; - auto index_iter = - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, - /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, - /*lookup_context=*/&context); - - index_iter->Seek(key); - uint64_t result; - if (index_iter->Valid()) { - BlockHandle handle = index_iter->value().handle; +uint64_t BlockBasedTable::ApproximateOffsetOf( + const InternalIteratorBase& index_iter) const { + uint64_t result = 0; + if (index_iter.Valid()) { + BlockHandle handle = index_iter.value().handle; result = handle.offset(); } else { - // key is past the last key in the file. If table_properties is not + // The iterator is past the last key in the file. If table_properties is not // available, approximate the offset by returning the offset of the // metaindex block (which is right near the end of the file). - result = 0; if (rep_->table_properties) { result = rep_->table_properties->data_size; } @@ -3949,11 +3940,48 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, } } + return result; +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) { + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + auto index_iter = + NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; if (index_iter != &iiter_on_stack) { - delete index_iter; + iiter_unique_ptr.reset(index_iter); } - return result; + index_iter->Seek(key); + return ApproximateOffsetOf(*index_iter); +} + +uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) { + assert(rep_->internal_comparator.Compare(start, end) <= 0); + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + auto index_iter = + NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(start); + uint64_t start_offset = ApproximateOffsetOf(*index_iter); + index_iter->Seek(end); + uint64_t end_offset = ApproximateOffsetOf(*index_iter); + + assert(end_offset >= start_offset); + return end_offset - start_offset; } bool BlockBasedTable::TEST_FilterBlockInCache() const { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 017199d80..2b0c4754a 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -151,13 +151,20 @@ class BlockBasedTable : public TableReader { // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were - // present in the file). The returned value is in terms of file + // present in the file). The returned value is in terms of file // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. uint64_t ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) override; + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. + // The start key must not be greater than the end key. + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; + bool TEST_BlockInCache(const BlockHandle& handle) const; // Returns true if the block for the specified key is in cache. @@ -438,6 +445,10 @@ class BlockBasedTable : public TableReader { static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer, size_t* size); + // Given an iterator return its offset in file. + uint64_t ApproximateOffsetOf( + const InternalIteratorBase& index_iter) const; + // Helper functions for DumpTable() Status DumpIndexBlock(WritableFile* out_file); Status DumpDataBlocks(WritableFile* out_file); diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index 10db08425..ea33ffb2a 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -62,6 +62,12 @@ class CuckooTableReader: public TableReader { TableReaderCaller /*caller*/) override { return 0; } + + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + void SetupForCompaction() override {} // End of methods not implemented. diff --git a/table/mock_table.h b/table/mock_table.h index 4b886e63e..deb383231 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -55,6 +55,11 @@ class MockTableReader : public TableReader { return 0; } + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + size_t ApproximateMemoryUsage() const override { return 0; } void SetupForCompaction() override {} diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index f6c348fdb..63a28e34a 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -620,6 +620,12 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, return 0; } +uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, + const Slice& /*end*/, + TableReaderCaller /*caller*/) { + return 0; +} + PlainTableIterator::PlainTableIterator(PlainTableReader* table, bool use_prefix_seek) : table_(table), diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index f63649cac..ab108b216 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -94,6 +94,9 @@ class PlainTableReader: public TableReader { uint64_t ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) override; + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; + uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; diff --git a/table/table_reader.h b/table/table_reader.h index eb383c8fe..e968014b1 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -65,6 +65,12 @@ class TableReader { virtual uint64_t ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) = 0; + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. + virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) = 0; + // Set up the table for Compaction. Might change some parameters with // posix_fadvise virtual void SetupForCompaction() = 0;