From b2781522613ab59dee3324abe9fe7a9ead9b77f0 Mon Sep 17 00:00:00 2001 From: Baptiste Lemaire Date: Mon, 2 Aug 2021 20:25:39 -0700 Subject: [PATCH] Fix db stress crash mempurge (#8604) Summary: The db_stress crash was caused by a call to `IsFlushPending()` made by a stats function which triggered an `assert([false])`, which I didn't plan when I created the `trigger_flush` bool. It turns out that this bool variable is not useful: I created it because I thought the `imm_flush_needed` atomic bool would actually trigger a flush. It turns out that this bool is only checked in `IsFlushPending` - this is its only use - and a flush is triggered by either a background thread checking on the imm array, or by an explicit call to `SchedulePendingFlush` which creates a flush request, that is then added to a flush request queue. In this PR, I reverted the MemtableList::Add function to what it was before my changes. I tested the fix by running the exact command line that deterministically triggered the assert error (see below), which confirmed that this is where the error was coming from. I also run `db_crashtest.py whitebox` and `blackbox` for a couple hours locally before committing this PR. Experiment run: ```./db_stress --acquire_snapshot_one_in=0 --allow_concurrent_memtable_write=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=76.90653425292307 --bottommost_compression_type=disable --cache_index_and_filter_blocks=1 --cache_size=1048576 --checkpoint_one_in=1000000 --checksum_type=kCRC32c --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000000 --compact_range_one_in=0 --compaction_ttl=2 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_blackbox --db_write_buffer_size=0 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --enable_compaction_filter=1 --enable_pipelined_write=0 --expected_values_path=/dev/shm/rocksdb/rocksdb_crashtest_expected --experimental_allow_mempurge=1 --experimental_mempurge_policy=kAlternate --fail_if_options_file_error=1 --file_checksum_impl=none --flush_one_in=1000000 --format_version=2 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=14 --index_type=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=False --long_running_snapshots=1 --mark_for_compaction_one_file_in=10 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtablerep=skip_list --mmap_read=0 --mock_direct_io=True --nooverwritepercent=1 --open_files=-1 --open_metadata_write_fault_one_in=8 --open_read_fault_one_in=32 --open_write_fault_one_in=16 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=0 --partition_filters=0 --partition_pinning=0 --pause_background_one_in=1000000 --periodic_compaction_seconds=1000 --prefix_size=-1 --prefixpercent=0 --progress_reports=0 --read_fault_one_in=0 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --set_options_one_in=0 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=0 --subcompactions=3 --sync=1 --sync_fault_injection=False --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=1 --unpartitioned_pinning=3 --use_clock_cache=0 --use_direct_io_for_flush_and_compaction=1 --use_direct_reads=0 --use_full_merge_v1=1 --use_merge=0 --use_multiget=0 --use_ribbon_filter=1 --user_timestamp_size=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --write_buffer_size=33554432 --write_dbid_to_manifest=1 --writepercent=35``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/8604 Reviewed By: pdillinger Differential Revision: D30047295 Pulled By: bjlemaire fbshipit-source-id: b9e379bfa3d6b9bd2b275725fb0bca4bd81a3dbe --- db/flush_job.cc | 9 +++------ db/memtable_list.cc | 6 ++---- db/memtable_list.h | 3 +-- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/db/flush_job.cc b/db/flush_job.cc index 9ce20b2b4..3a258a57e 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -577,12 +577,9 @@ Status FlushJob::MemPurge() { } new_mem->SetID(new_mem_id); cfd_->imm()->AddMemPurgeOutputID(new_mem_id); - cfd_->imm()->Add(new_mem, - &job_context_->memtables_to_free, - false /* -> trigger_flush=false: - * adding this memtable - * will not trigger a flush. - */); + // This addition will not trigger another flush, because + // we do not call SchedulePendingFlush(). + cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free); new_mem_capacity = (new_mem->ApproximateMemoryUsage()) * 1.0 / mutable_cf_options_.write_buffer_size; new_mem->Ref(); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 36bdd98f5..c35ebd5a5 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -539,8 +539,7 @@ Status MemTableList::TryInstallMemtableFlushResults( } // New memtables are inserted at the front of the list. -void MemTableList::Add(MemTable* m, autovector* to_delete, - bool trigger_flush) { +void MemTableList::Add(MemTable* m, autovector* to_delete) { assert(static_cast(current_->memlist_.size()) >= num_flush_not_started_); InstallNewVersion(); // this method is used to move mutable memtable into an immutable list. @@ -551,8 +550,7 @@ void MemTableList::Add(MemTable* m, autovector* to_delete, current_->Add(m, to_delete); m->MarkImmutable(); num_flush_not_started_++; - - if (num_flush_not_started_ > 0 && trigger_flush) { + if (num_flush_not_started_ == 1) { imm_flush_needed.store(true, std::memory_order_release); } UpdateCachedValuesFromMemTableListVersion(); diff --git a/db/memtable_list.h b/db/memtable_list.h index ada6469a1..9ae50e47f 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -275,8 +275,7 @@ class MemTableList { // By default, adding memtables will flag that the memtable list needs to be // flushed, but in certain situations, like after a mempurge, we may want to // avoid flushing the memtable list upon addition of a memtable. - void Add(MemTable* m, autovector* to_delete, - bool trigger_flush = true); + void Add(MemTable* m, autovector* to_delete); // Returns an estimate of the number of bytes of data in use. size_t ApproximateMemoryUsage();