From 0a89cea5f55b6a57036236cc5995a0634c980735 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Mon, 28 Feb 2022 23:45:08 -0800 Subject: [PATCH] Handle failures in block-based table size/offset approximation (#9615) Summary: In crash test with fault injection, we were seeing stack traces like the following: ``` https://github.com/facebook/rocksdb/issues/3 0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245, function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101 https://github.com/facebook/rocksdb/issues/4 0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=, start=..., end=..., caller=) at table/block_based/block_based_table_reader.cc:3224 https://github.com/facebook/rocksdb/issues/5 0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719 https://github.com/facebook/rocksdb/issues/6 0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=, v=, f=..., start=..., end=..., caller=) at ./db/version_set.h:850 https://github.com/facebook/rocksdb/issues/7 0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=, caller=) at db/version_set.cc:5657 https://github.com/facebook/rocksdb/issues/8 0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=) at ./include/rocksdb/options.h:1869 https://github.com/facebook/rocksdb/issues/9 0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546 ``` The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size` , thus it was easy to trip the assertion that `end_offset >= start_offset`. The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615 Test Plan: run the repro command for a while and stopped seeing coredumps - ``` $ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done ``` Reviewed By: pdillinger Differential Revision: D34383069 Pulled By: ajkr fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f --- table/block_based/block_based_table_reader.cc | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 4908a03af..686f38ba1 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -3158,6 +3158,7 @@ Status BlockBasedTable::CreateIndexReader( uint64_t BlockBasedTable::ApproximateDataOffsetOf( const InternalIteratorBase& index_iter, uint64_t data_size) const { + assert(index_iter.status().ok()); if (index_iter.Valid()) { BlockHandle handle = index_iter.value().handle; return handle.offset(); @@ -3200,8 +3201,16 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, } index_iter->Seek(key); + uint64_t offset; + if (index_iter->status().ok()) { + offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } - uint64_t offset = ApproximateDataOffsetOf(*index_iter, data_size); // Pro-rate file metadata (incl filters) size-proportionally across data // blocks. double size_ratio = @@ -3217,7 +3226,9 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, uint64_t data_size = GetApproximateDataSize(); if (UNLIKELY(data_size == 0)) { // Hmm. Assume whole file is involved, since we have lower and upper - // bound. + // bound. This likely skews the estimate if we consider that this function + // is typically called with `[start, end]` fully contained in the file's + // key-range. return rep_->file_size; } @@ -3235,9 +3246,24 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, } index_iter->Seek(start); - uint64_t start_offset = ApproximateDataOffsetOf(*index_iter, data_size); + uint64_t start_offset; + if (index_iter->status().ok()) { + start_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved from the start. This likely skews the estimate + // but is consistent with the above error handling. + start_offset = 0; + } + index_iter->Seek(end); - uint64_t end_offset = ApproximateDataOffsetOf(*index_iter, data_size); + uint64_t end_offset; + if (index_iter->status().ok()) { + end_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved until the end. This likely skews the estimate + // but is consistent with the above error handling. + end_offset = data_size; + } assert(end_offset >= start_offset); // Pro-rate file metadata (incl filters) size-proportionally across data