From f85b31a2e958a65e73797932f23985e398562aa7 Mon Sep 17 00:00:00 2001 From: Akanksha Mahajan Date: Wed, 8 Jun 2022 14:16:43 -0700 Subject: [PATCH] Fix bug for WalManager with compressed WAL (#10130) Summary: RocksDB uses WalManager to manage WAL files. In WalManager::ReadFirstLine(), the assumption is that reading the first record of a valid WAL file will return OK status and set the output sequence to non-zero value. This assumption has been broken by WAL compression which writes a `kSetCompressionType` record which is not associated with any sequence number. Consequently, WalManager::GetSortedWalsOfType() will skip these WALs and not return them to caller, e.g. Checkpoint, Backup, causing the operations to fail. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10130 Test Plan: - Newly Added test Reviewed By: riversand963 Differential Revision: D36985744 Pulled By: akankshamahajan15 fbshipit-source-id: dfde7b3be68b6a30b75b49479779748eedf29f7f --- HISTORY.md | 4 ++-- db/db_wal_test.cc | 37 +++++++++++++++++++++++++++++++++++++ db/log_reader.h | 4 ++++ db/wal_manager.cc | 18 ++++++++++++++---- db/wal_manager.h | 16 ++++++++++++++++ 5 files changed, 73 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 8caf7ce03..e65f204af 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,7 +7,7 @@ * Fix a race condition in WAL size tracking which is caused by an unsafe iterator access after container is changed. * Fix unprotected concurrent accesses to `WritableFileWriter::filesize_` by `DB::SyncWAL()` and `DB::Put()` in two write queue mode. * Fix a bug in WAL tracking. Before this PR (#10087), calling `SyncWAL()` on the only WAL file of the db will not log the event in MANIFEST, thus allowing a subsequent `DB::Open` even if the WAL file is missing or corrupted. - +* Fixed a bug in WAL tracking with wal_compression. WAL compression writes a kSetCompressionType record which is not associated with any sequence number. As result, WalManager::GetSortedWalsOfType() will skip these WALs and not return them to caller, e.g. Checkpoint, Backup, causing the operations to fail. ### Public API changes * Add new API GetUnixTime in Snapshot class which returns the unix time at which Snapshot is taken. @@ -24,7 +24,7 @@ ### New Features * Add FileSystem::ReadAsync API in io_tracing -* Add blob garbage collection parameters `blob_garbage_collection_policy` and `blob_garbage_collection_age_cutoff` to both force-enable and force-disable GC, as well as selectively override age cutoff when using CompactRange. +* Add blob garbage collection parameters `blob_garbage_collection_policy` and `blob_garbage_collection_age_cutoff` to both force-enable and force-disable GC, as well as selectively override age cutoff when using CompactRange. * Add an extra sanity check in `GetSortedWalFiles()` (also used by `GetLiveFilesStorageInfo()`, `BackupEngine`, and `Checkpoint`) to reduce risk of successfully created backup or checkpoint failing to open because of missing WAL file. * Add a new column family option `blob_file_starting_level` to enable writing blob files during flushes and compactions starting from the specified LSM tree level. diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 19c1b334f..e4ea2814a 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -2243,6 +2243,43 @@ TEST_F(DBWALTest, WalTermTest) { ASSERT_EQ("bar", Get(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(1, "foo2")); } + +#ifndef ROCKSDB_LITE +TEST_F(DBWALTest, GetCompressedWalsAfterSync) { + if (db_->GetOptions().wal_compression == kNoCompression) { + ROCKSDB_GTEST_SKIP("stream compression not present"); + return; + } + Options options = GetDefaultOptions(); + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.create_if_missing = true; + options.env = env_; + options.avoid_flush_during_recovery = true; + options.track_and_verify_wals_in_manifest = true; + // Enable WAL compression so that the newly-created WAL will be non-empty + // after DB open, even if point-in-time WAL recovery encounters no + // corruption. + options.wal_compression = kZSTD; + DestroyAndReopen(options); + + // Write something to memtable and WAL so that log_empty_ will be false after + // next DB::Open(). + ASSERT_OK(Put("a", "v")); + + Reopen(options); + + // New WAL is created, thanks to !log_empty_. + ASSERT_OK(dbfull()->TEST_SwitchWAL()); + + ASSERT_OK(Put("b", "v")); + + ASSERT_OK(db_->SyncWAL()); + + VectorLogPtr wals; + Status s = dbfull()->GetSortedWalFiles(wals); + ASSERT_OK(s); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/log_reader.h b/db/log_reader.h index 6f5b98621..dbea81728 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -103,6 +103,10 @@ class Reader { return static_cast(end_of_buffer_offset_); } + bool IsCompressedAndEmptyFile() { + return !first_record_read_ && compression_type_record_read_; + } + protected: std::shared_ptr info_log_; const std::unique_ptr file_; diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 83a3636fb..ed76905d4 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -507,10 +507,20 @@ Status WalManager::ReadFirstLine(const std::string& fname, } } - // ReadRecord might have returned false on EOF, which means that the log file - // is empty. Or, a failure may have occurred while processing the first entry. - // In any case, return status and set sequence number to 0. - *sequence = 0; + if (status.ok() && reader.IsCompressedAndEmptyFile()) { + // In case of wal_compression, it writes a `kSetCompressionType` record + // which is not associated with any sequence number. As result for an empty + // file, GetSortedWalsOfType() will skip these WALs causing the operations + // to fail. + // Therefore, in order to avoid that failure, it sets sequence_number to 1 + // indicating those WALs should be included. + *sequence = 1; + } else { + // ReadRecord might have returned false on EOF, which means that the log + // file is empty. Or, a failure may have occurred while processing the first + // entry. In any case, return status and set sequence number to 0. + *sequence = 0; + } return status; } diff --git a/db/wal_manager.h b/db/wal_manager.h index 743a0ce5f..771c48495 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -85,9 +85,25 @@ class WalManager { Status RetainProbableWalFiles(VectorLogPtr& all_logs, const SequenceNumber target); + // ReadFirstRecord checks the read_first_record_cache_ to see if the entry + // exists or not. If not, it will read the WAL file. + // In case of wal_compression, WAL contains a `kSetCompressionType` record + // which is not associated with any sequence number. So the sequence_number is + // set to 1 if that WAL doesn't include any other record (basically empty) in + // order to include that WAL and is inserted in read_first_record_cache_. + // Therefore, sequence_number is used as boolean if WAL should be included or + // not and that sequence_number shouldn't be use for any other purpose. Status ReadFirstRecord(const WalFileType type, const uint64_t number, SequenceNumber* sequence); + // In case of no wal_compression, ReadFirstLine returns status.ok() and + // sequence == 0 if the file exists, but is empty. + // In case of wal_compression, WAL contains + // `kSetCompressionType` record which is not associated with any sequence + // number if that WAL doesn't include any other record (basically empty). As + // result for an empty file, GetSortedWalsOfType() will skip these WALs + // causing the operations to fail. To avoid that, it sets sequence_number to + // 1 inorder to include that WAL. Status ReadFirstLine(const std::string& fname, const uint64_t number, SequenceNumber* sequence);