Don't wait for indirect flush in read-only DB (#10569)

Summary:
Some APIs for getting live files, which are used by Checkpoint
and BackupEngine, can optionally trigger and wait for a flush. These
would deadlock when used on a read-only DB. Here we fix that by assuming
the user wants the overall operation to succeed and is OK without
flushing (because the DB is read-only).

Follow-up work: the same or other issues can be hit by directly invoking
some DB functions that are clearly not appropriate for a read-only
instance but are not covered by overrides in DBImplReadOnly and
CompactedDBImpl. These should be fixed to avoid similar problems on
accidental misuse. (Long term, it would be nice to have a DBReadOnly
class without those members, like BackupEngineReadOnly.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10569

Test Plan: tests updated to catch the regression (they hang without the fix)

Reviewed By: riversand963

Differential Revision: D38995759

Pulled By: pdillinger

fbshipit-source-id: f5f8bc7123e13cb45bd393dd974d7d6eda20bc68
main
Peter Dillinger 2 years ago committed by Facebook GitHub Bot
parent 5532b462c4
commit c5afbbfe4b
  1. 1
      HISTORY.md
  2. 22
      db/db_basic_test.cc
  3. 12
      db/db_impl/compacted_db_impl.h
  4. 2
      db/db_impl/db_impl.h
  5. 11
      db/db_impl/db_impl_readonly.h
  6. 8
      db/db_impl/db_impl_secondary.h
  7. 7
      include/rocksdb/db.h
  8. 3
      include/rocksdb/options.h
  9. 7
      utilities/backup/backup_engine_test.cc

@ -1,6 +1,7 @@
# Rocksdb Change Log
## Unreleased
### Bug Fixes
* Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB.
* Fixed bug where `FlushWAL(true /* sync */)` (used by `GetLiveFilesStorageInfo()`, which is used by checkpoint and backup) could cause parallel writes at the tail of a WAL file to never be synced.
* Fixed periodic_task being unable to re-register the same task type, which may cause `SetOptions()` to fail to update periodic task times such as `stats_dump_period_sec` and `stats_persist_period_sec`.

@ -253,6 +253,7 @@ TEST_F(DBBasicTest, CompactedDB) {
ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
ASSERT_OK(Flush()); ASSERT_OK(Flush());
ASSERT_OK(Put("something_not_flushed", "x"));
Close(); Close();
ASSERT_OK(ReadOnlyReopen(options)); ASSERT_OK(ReadOnlyReopen(options));
@ -260,6 +261,20 @@ TEST_F(DBBasicTest, CompactedDB) {
s = Put("new", "value"); s = Put("new", "value");
ASSERT_EQ(s.ToString(), ASSERT_EQ(s.ToString(),
"Not implemented: Not supported operation in read only mode."); "Not implemented: Not supported operation in read only mode.");
// TODO: validate that other write ops return NotImplemented
// (DBImplReadOnly is missing some overrides)
// Ensure no deadlock on flush triggered by another API function
// (Old deadlock bug depends on something_not_flushed above.)
std::vector<std::string> files;
uint64_t manifest_file_size;
ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
LiveFilesStorageInfoOptions lfsi_opts;
lfsi_opts.wal_size_for_flush = 0; // always
std::vector<LiveFileStorageInfo> files2;
ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
Close(); Close();
// Full compaction // Full compaction
@ -290,6 +305,13 @@ TEST_F(DBBasicTest, CompactedDB) {
ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj")); ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
ASSERT_EQ("NOT_FOUND", Get("kkk")); ASSERT_EQ("NOT_FOUND", Get("kkk"));
// TODO: validate that other write ops return NotImplemented
// (CompactedDB is missing some overrides)
// Ensure no deadlock on flush triggered by another API function
ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
// MultiGet // MultiGet
std::vector<std::string> values; std::vector<std::string> values;
std::vector<Status> status_list = dbfull()->MultiGet( std::vector<Status> status_list = dbfull()->MultiGet(

@ -11,6 +11,7 @@
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
// TODO: Share common structure with DBImplSecondary and DBImplReadOnly
class CompactedDBImpl : public DBImpl { class CompactedDBImpl : public DBImpl {
public: public:
CompactedDBImpl(const DBOptions& options, const std::string& dbname); CompactedDBImpl(const DBOptions& options, const std::string& dbname);
@ -127,6 +128,17 @@ class CompactedDBImpl : public DBImpl {
return Status::NotSupported("Not supported in compacted db mode."); return Status::NotSupported("Not supported in compacted db mode.");
} }
// FIXME: some missing overrides for more "write" functions
// Share with DBImplReadOnly?
protected:
#ifndef ROCKSDB_LITE
// Overrides the flush step that the live-files APIs (e.g. GetLiveFiles()
// with flush_memtable=true, GetLiveFilesStorageInfo()) may trigger and wait
// on. Such indirect flush requests are ignored in compacted DB mode: the
// override does nothing and reports OK, so the overall operation can
// proceed instead of hanging.
Status FlushForGetLiveFiles() override {
// No-op for read-only DB
return Status::OK();
}
#endif // !ROCKSDB_LITE
private: private:
friend class DB; friend class DB;
inline size_t FindFile(const Slice& key); inline size_t FindFile(const Slice& key);

@ -1387,7 +1387,7 @@ class DBImpl : public DB {
void NotifyOnExternalFileIngested( void NotifyOnExternalFileIngested(
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job); ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
Status FlushForGetLiveFiles(); virtual Status FlushForGetLiveFiles();
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE
void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;

@ -13,6 +13,7 @@
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
// TODO: Share common structure with CompactedDBImpl and DBImplSecondary
class DBImplReadOnly : public DBImpl { class DBImplReadOnly : public DBImpl {
public: public:
DBImplReadOnly(const DBOptions& options, const std::string& dbname); DBImplReadOnly(const DBOptions& options, const std::string& dbname);
@ -141,6 +142,16 @@ class DBImplReadOnly : public DBImpl {
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported operation in read only mode.");
} }
// FIXME: some missing overrides for more "write" functions
protected:
#ifndef ROCKSDB_LITE
// Overrides the flush step that the live-files APIs (e.g. GetLiveFiles()
// with flush_memtable=true, GetLiveFilesStorageInfo()) may trigger and wait
// on. A read-only DB ignores such indirect flush requests: the override
// does nothing and reports OK, so the overall operation can proceed
// instead of hanging.
Status FlushForGetLiveFiles() override {
// No-op for read-only DB
return Status::OK();
}
#endif // !ROCKSDB_LITE
private: private:
// A "helper" function for DB::OpenForReadOnly without column families // A "helper" function for DB::OpenForReadOnly without column families
// to reduce unnecessary I/O // to reduce unnecessary I/O

@ -71,6 +71,7 @@ class LogReaderContainer {
// The secondary instance can be opened using `DB::OpenAsSecondary`. After // The secondary instance can be opened using `DB::OpenAsSecondary`. After
// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best // that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best
// effort attempts to catch up with the primary. // effort attempts to catch up with the primary.
// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly
class DBImplSecondary : public DBImpl { class DBImplSecondary : public DBImpl {
public: public:
DBImplSecondary(const DBOptions& options, const std::string& dbname, DBImplSecondary(const DBOptions& options, const std::string& dbname,
@ -268,6 +269,13 @@ class DBImplSecondary : public DBImpl {
#endif // NDEBUG #endif // NDEBUG
protected: protected:
#ifndef ROCKSDB_LITE
// Overrides the flush step that the live-files APIs (e.g. GetLiveFiles()
// with flush_memtable=true, GetLiveFilesStorageInfo()) may trigger and wait
// on. The override does nothing and reports OK, i.e. indirect flush
// requests are ignored here in the same way as for a read-only DB.
// NOTE(review): presumably a secondary instance never flushes memtables
// itself -- confirm against DBImplSecondary's write-path overrides.
Status FlushForGetLiveFiles() override {
// No-op for read-only DB
return Status::OK();
}
#endif // !ROCKSDB_LITE
// ColumnFamilyCollector is a write batch handler which does nothing // ColumnFamilyCollector is a write batch handler which does nothing
// except recording unique column family IDs // except recording unique column family IDs
class ColumnFamilyCollector : public WriteBatch::Handler { class ColumnFamilyCollector : public WriteBatch::Handler {

@ -1562,9 +1562,10 @@ class DB {
// The valid size of the manifest file is returned in manifest_file_size. // The valid size of the manifest file is returned in manifest_file_size.
// The manifest file is an ever growing file, but only the portion specified // The manifest file is an ever growing file, but only the portion specified
// by manifest_file_size is valid for this snapshot. Setting flush_memtable // by manifest_file_size is valid for this snapshot. Setting flush_memtable
// to true does Flush before recording the live files. Setting flush_memtable // to true does Flush before recording the live files (unless DB is
// to false is useful when we don't want to wait for flush which may have to // read-only). Setting flush_memtable to false is useful when we don't want
// wait for compaction to complete taking an indeterminate time. // to wait for flush which may have to wait for compaction to complete
// taking an indeterminate time.
// //
// NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate
// a lossless backup, GetLiveFilesStorageInfo() is strongly recommended // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended

@ -2094,7 +2094,8 @@ struct LiveFilesStorageInfoOptions {
// Whether to populate FileStorageInfo::file_checksum* or leave blank // Whether to populate FileStorageInfo::file_checksum* or leave blank
bool include_checksum_info = false; bool include_checksum_info = false;
// Flushes memtables if total size in bytes of live WAL files is >= this // Flushes memtables if total size in bytes of live WAL files is >= this
// number. Default: always force a flush without checking sizes. // number (and DB is not read-only).
// Default: always force a flush without checking sizes.
uint64_t wal_size_for_flush = 0; uint64_t wal_size_for_flush = 0;
}; };
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE

@ -2984,9 +2984,12 @@ TEST_F(BackupEngineTest, ReadOnlyBackupEngine) {
DestroyDBWithoutCheck(dbname_, options_); DestroyDBWithoutCheck(dbname_, options_);
OpenDBAndBackupEngine(true); OpenDBAndBackupEngine(true);
FillDB(db_.get(), 0, 100); FillDB(db_.get(), 0, 100);
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); // Also test read-only DB with CreateNewBackup and flush=true (no flush)
CloseAndReopenDB(/*read_only*/ true);
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
CloseAndReopenDB(/*read_only*/ false);
FillDB(db_.get(), 100, 200); FillDB(db_.get(), 100, 200);
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
CloseDBAndBackupEngine(); CloseDBAndBackupEngine();
DestroyDBWithoutCheck(dbname_, options_); DestroyDBWithoutCheck(dbname_, options_);

Loading…
Cancel
Save