From c5afbbfe4bf4ed97bf379bc605f3622c5630a5fa Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Mon, 29 Aug 2022 13:36:23 -0700 Subject: [PATCH] Don't wait for indirect flush in read-only DB (#10569) Summary: Some APIs for getting live files, which are used by Checkpoint and BackupEngine, can optionally trigger and wait for a flush. These would deadlock when used on a read-only DB. Here we fix that by assuming the user wants the overall operation to succeed and is OK without flushing (because the DB is read-only). Follow-up work: the same or other issues can be hit by directly invoking some DB functions that are clearly not appropriate for read-only instance, but are not covered by overrides in DBImplReadOnly and CompactedDBImpl. These should be fixed to avoid similar problems on accidental misuse. (Long term, it would be nice to have a DBReadOnly class without those members, like BackupEngineReadOnly.) Pull Request resolved: https://github.com/facebook/rocksdb/pull/10569 Test Plan: tests updated to catch regression (hang before the fix) Reviewed By: riversand963 Differential Revision: D38995759 Pulled By: pdillinger fbshipit-source-id: f5f8bc7123e13cb45bd393dd974d7d6eda20bc68 --- HISTORY.md | 1 + db/db_basic_test.cc | 22 ++++++++++++++++++++++ db/db_impl/compacted_db_impl.h | 12 ++++++++++++ db/db_impl/db_impl.h | 2 +- db/db_impl/db_impl_readonly.h | 11 +++++++++++ db/db_impl/db_impl_secondary.h | 8 ++++++++ include/rocksdb/db.h | 7 ++++--- include/rocksdb/options.h | 3 ++- utilities/backup/backup_engine_test.cc | 7 +++++-- 9 files changed, 66 insertions(+), 7 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6a62b4121..3b07e26f8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,7 @@ # Rocksdb Change Log ## Unreleased ### Bug Fixes +* Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB. * Fixed bug where `FlushWAL(true /* sync */)` (used by `GetLiveFilesStorageInfo()`, which is used by checkpoint and backup) could cause parallel writes at the tail of a WAL file to never be synced. * Fix periodic_task unable to re-register the same task type, which may cause `SetOptions()` fail to update periodical_task time like: `stats_dump_period_sec`, `stats_persist_period_sec`. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 9f866a91a..7549087cf 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -253,6 +253,7 @@ TEST_F(DBBasicTest, CompactedDB) { ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); ASSERT_OK(Flush()); + ASSERT_OK(Put("something_not_flushed", "x")); Close(); ASSERT_OK(ReadOnlyReopen(options)); @@ -260,6 +261,20 @@ TEST_F(DBBasicTest, CompactedDB) { s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported operation in read only mode."); + + // TODO: validate that other write ops return NotImplemented + // (DBImplReadOnly is missing some overrides) + + // Ensure no deadlock on flush triggered by another API function + // (Old deadlock bug depends on something_not_flushed above.) + std::vector files; + uint64_t manifest_file_size; + ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true)); + LiveFilesStorageInfoOptions lfsi_opts; + lfsi_opts.wal_size_for_flush = 0; // always + std::vector files2; + ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2)); + Close(); // Full compaction @@ -290,6 +305,13 @@ TEST_F(DBBasicTest, CompactedDB) { ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj")); ASSERT_EQ("NOT_FOUND", Get("kkk")); + // TODO: validate that other write ops return NotImplemented + // (CompactedDB is missing some overrides) + + // Ensure no deadlock on flush triggered by another API function + ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true)); + ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2)); + // MultiGet std::vector values; std::vector status_list = dbfull()->MultiGet( diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index 5fa412caf..9495c447c 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -11,6 +11,7 @@ namespace ROCKSDB_NAMESPACE { +// TODO: Share common structure with DBImplSecondary and DBImplReadOnly class CompactedDBImpl : public DBImpl { public: CompactedDBImpl(const DBOptions& options, const std::string& dbname); @@ -127,6 +128,17 @@ class CompactedDBImpl : public DBImpl { return Status::NotSupported("Not supported in compacted db mode."); } + // FIXME: some missing overrides for more "write" functions + // Share with DBImplReadOnly? + + protected: +#ifndef ROCKSDB_LITE + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } +#endif // !ROCKSDB_LITE + private: friend class DB; inline size_t FindFile(const Slice& key); diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 3053d64ac..7cc9bfbf2 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1387,7 +1387,7 @@ class DBImpl : public DB { void NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job); - Status FlushForGetLiveFiles(); + virtual Status FlushForGetLiveFiles(); #endif // !ROCKSDB_LITE void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 21fa1021f..344fa4720 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -13,6 +13,7 @@ namespace ROCKSDB_NAMESPACE { +// TODO: Share common structure with CompactedDBImpl and DBImplSecondary class DBImplReadOnly : public DBImpl { public: DBImplReadOnly(const DBOptions& options, const std::string& dbname); @@ -141,6 +142,16 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + // FIXME: some missing overrides for more "write" functions + + protected: +#ifndef ROCKSDB_LITE + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } +#endif // !ROCKSDB_LITE + private: // A "helper" function for DB::OpenForReadOnly without column families // to reduce unnecessary I/O diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 867786ed3..a58ea4449 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -71,6 +71,7 @@ class LogReaderContainer { // The secondary instance can be opened using `DB::OpenAsSecondary`. After // that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best // effort attempts to catch up with the primary. +// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly class DBImplSecondary : public DBImpl { public: DBImplSecondary(const DBOptions& options, const std::string& dbname, @@ -268,6 +269,13 @@ class DBImplSecondary : public DBImpl { #endif // NDEBUG protected: +#ifndef ROCKSDB_LITE + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } +#endif // !ROCKSDB_LITE + // ColumnFamilyCollector is a write batch handler which does nothing // except recording unique column family IDs class ColumnFamilyCollector : public WriteBatch::Handler { diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index b459aaf1c..b7fcdd786 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1562,9 +1562,10 @@ class DB { // The valid size of the manifest file is returned in manifest_file_size. // The manifest file is an ever growing file, but only the portion specified // by manifest_file_size is valid for this snapshot. Setting flush_memtable - // to true does Flush before recording the live files. Setting flush_memtable - // to false is useful when we don't want to wait for flush which may have to - // wait for compaction to complete taking an indeterminate time. + // to true does Flush before recording the live files (unless DB is + // read-only). Setting flush_memtable to false is useful when we don't want + // to wait for flush which may have to wait for compaction to complete + // taking an indeterminate time. // // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0338c7697..afbe9065a 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -2094,7 +2094,8 @@ struct LiveFilesStorageInfoOptions { // Whether to populate FileStorageInfo::file_checksum* or leave blank bool include_checksum_info = false; // Flushes memtables if total size in bytes of live WAL files is >= this - // number. Default: always force a flush without checking sizes. + // number (and DB is not read-only). + // Default: always force a flush without checking sizes. uint64_t wal_size_for_flush = 0; }; #endif // !ROCKSDB_LITE diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc index b804c3a17..4394f3c25 100644 --- a/utilities/backup/backup_engine_test.cc +++ b/utilities/backup/backup_engine_test.cc @@ -2984,9 +2984,12 @@ TEST_F(BackupEngineTest, ReadOnlyBackupEngine) { DestroyDBWithoutCheck(dbname_, options_); OpenDBAndBackupEngine(true); FillDB(db_.get(), 0, 100); - ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + // Also test read-only DB with CreateNewBackup and flush=true (no flush) + CloseAndReopenDB(/*read_only*/ true); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true)); + CloseAndReopenDB(/*read_only*/ false); FillDB(db_.get(), 100, 200); - ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true)); CloseDBAndBackupEngine(); DestroyDBWithoutCheck(dbname_, options_);