diff --git a/HISTORY.md b/HISTORY.md index f4ddfef87..556b25782 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,13 +13,12 @@ ### Public API Change * `DB::GetDbSessionId(std::string& session_id)` is added. `session_id` stores a unique identifier that gets reset every time the DB is opened. This DB session ID should be unique among all open DB instances on all hosts, and should be unique among re-openings of the same or other DBs. This identifier is recorded in the LOG file on the line starting with "DB Session ID:". * `DB::OpenForReadOnly()` now returns `Status::NotFound` when the specified DB directory does not exist. Previously the error returned depended on the underlying `Env`. This change is available in all 6.11 releases as well. -* A parameter `verify_with_checksum` is added to `BackupEngine::VerifyBackup`, which is true by default. So now `BackupEngine::VerifyBackup` verifies checksums and file sizes of backup files by default. Pass `false` for `verify_with_checksum` to maintain the previous behavior and performance of `BackupEngine::VerifyBackup`, by only verifying sizes of backup files. +* A parameter `verify_with_checksum` is added to `BackupEngine::VerifyBackup`, which is false by default. If it is ture, `BackupEngine::VerifyBackup` verifies checksums and file sizes of backup files. Pass `false` for `verify_with_checksum` to maintain the previous behavior and performance of `BackupEngine::VerifyBackup`, by only verifying sizes of backup files. ### New Features * DB identity (`db_id`) and DB session identity (`db_session_id`) are added to table properties and stored in SST files. SST files generated from SstFileWriter and Repairer have DB identity “SST Writer” and “DB Repairer”, respectively. Their DB session IDs are generated in the same way as `DB::GetDbSessionId`. The session ID for SstFileWriter (resp., Repairer) resets every time `SstFileWriter::Open` (resp., `Repairer::Run`) is called. * Added experimental option BlockBasedTableOptions::optimize_filters_for_memory for reducing allocated memory size of Bloom filters (~10% savings with Jemalloc) while preserving the same general accuracy. To have an effect, the option requires format_version=5 and malloc_usable_size. Enabling this option is forward and backward compatible with existing format_version=5. -* `BackupableDBOptions::new_naming_for_backup_files` is added. This option is true by default. When it is true, backup table filenames are of the form `__.sst` as opposed to `__.sst`. When there is no `db_session_id` available in the table file, we use `file_size` as a fallback. Note that when this option is true, it comes into effect only when both `share_files_with_checksum` and `share_table_files` are true. -* Added compaction filter support for BlobDB non-TTL values. Same as vanilla RocksDB, user compaction filter applies to all k/v pairs of the compaction for non-TTL values. It honors `min_blob_size`, which potentially results value transitions between inlined data and stored-in-blob data when size of value is changed. +* `BackupTableNameOption BackupableDBOptions::share_files_with_checksum_naming` is added, where `BackupTableNameOption` is an `enum` type with two enumerators `kChecksumAndFileSize` and `kChecksumAndFileSize`. By default, `BackupTableNameOption BackupableDBOptions::share_files_with_checksum_naming` is set to `kChecksumAndDbSessionId`. In this default case, backup table filenames are of the form `__.sst` as opposed to `__.sst`. The new default behavior fixes the backup file name collision problem, which might be possible at large scale, but the option `kChecksumAndFileSize` is added to allow use of old naming in case it is needed. This default behavior change is not an upgrade issue, because previous versions of RocksDB can read, restore, and delete backups using new names, and it's OK for a backup directory to use a mixture of table file naming schemes. Note that `share_files_with_checksum_naming` comes into effect only when both `share_files_with_checksum` and `share_table_files` are true. ### Bug Fixes * Fail recovery and report once hitting a physical log record checksum mismatch, while reading MANIFEST. RocksDB should not continue processing the MANIFEST any further. diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 76b5e549e..3e2c6337f 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -24,6 +24,19 @@ namespace ROCKSDB_NAMESPACE { +// BackupTableNameOption describes possible naming schemes for backup +// table file names when the table files are stored in the shared_checksum +// directory (i.e., both share_table_files and share_files_with_checksum +// are true). +enum BackupTableNameOption : unsigned char { + // Backup SST filenames consist of file_number, crc32c, db_session_id + kChecksumAndFileSize = 0, + // Backup SST filenames consist of file_number, crc32c, file_size + // When there is no `db_session_id` available in the table file, we use + // `file_size` as a fallback. + kChecksumAndDbSessionId = 1 +}; + struct BackupableDBOptions { // Where to keep the backup files. Has to be different than dbname_ // Best to set this to dbname_ + "/backups" @@ -93,9 +106,9 @@ struct BackupableDBOptions { // (file name, crc32c, db session id or file length) // // Note: If this option is set to true, we recommend setting - // new_naming_for_backup_files to true as well, which is also our default - // option. Otherwise, there is a non-negligible chance of filename collision - // when sharing tables in shared_checksum among several DBs. + // share_files_with_checksum_naming to kChecksumAndDbSessionId, which is also + // our default option. Otherwise, there is a non-negligible chance of filename + // collision when sharing tables in shared_checksum among several DBs. // *turn it on only if you know what you're doing* // // Default: false @@ -122,16 +135,24 @@ struct BackupableDBOptions { // Default: INT_MAX int max_valid_backups_to_open; - // If true, backup SST filenames consist of file_number, crc32c, db_session_id - // if false, backup SST filenames consist of file_number, crc32c, file_size + // Naming option for share_files_with_checksum table files. This option + // can be set to kChecksumAndFileSize or kChecksumAndDbSessionId. + // kChecksumAndFileSize is susceptible to collision as file size is not a + // good source of entroy. + // kChecksumAndDbSessionId is immune to collision. // - // Default: true + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table file + // naming schemes. + // + // Default: kChecksumAndDbSessionId // // Note: This option comes into effect only if both share_files_with_checksum // and share_table_files are true. In the cases of old table files where no // db_session_id is stored, we use the file_size to replace the empty // db_session_id as a fallback. - bool new_naming_for_backup_files; + BackupTableNameOption share_files_with_checksum_naming; void Dump(Logger* logger) const; @@ -143,7 +164,8 @@ struct BackupableDBOptions { uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, int _max_valid_backups_to_open = INT_MAX, - bool _new_naming_for_backup_files = true) + BackupTableNameOption _share_files_with_checksum_naming = + kChecksumAndDbSessionId) : backup_dir(_backup_dir), backup_env(_backup_env), share_table_files(_share_table_files), @@ -157,7 +179,7 @@ struct BackupableDBOptions { max_background_operations(_max_background_operations), callback_trigger_interval_size(_callback_trigger_interval_size), max_valid_backups_to_open(_max_valid_backups_to_open), - new_naming_for_backup_files(_new_naming_for_backup_files) { + share_files_with_checksum_naming(_share_files_with_checksum_naming) { assert(share_table_files || !share_files_with_checksum); } }; @@ -309,7 +331,7 @@ class BackupEngineReadOnly { // // Returns Status::OK() if all checks are good virtual Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = true) = 0; + bool verify_with_checksum = false) = 0; }; // A backup engine for creating new backups. @@ -431,7 +453,7 @@ class BackupEngine { // // Returns Status::OK() if all checks are good virtual Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = true) = 0; + bool verify_with_checksum = false) = 0; // Will delete any files left over from incomplete creation or deletion of // a backup. This is not normally needed as those operations also clean up diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index b0d1a6204..aa5577c03 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -129,12 +129,14 @@ class BackupEngineImpl : public BackupEngine { } Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = true) override; + bool verify_with_checksum = false) override; Status Initialize(); - // Whether new naming for backup files is used or not - bool UseNewNaming() const { return options_.new_naming_for_backup_files; } + // Obtain the naming option for backup table files + BackupTableNameOption GetTableNamingOption() const { + return options_.share_files_with_checksum_naming; + } private: void DeleteChildren(const std::string& dir, uint32_t file_type_filter = 0); @@ -299,17 +301,19 @@ class BackupEngineImpl : public BackupEngine { return GetSharedChecksumDirRel() + "/" + (tmp ? "." : "") + file + (tmp ? ".tmp" : ""); } - // If UseNewNaming() && !db_session_id.empty(), backup SST filenames consist - // of file_number, crc32c, db_session_id. - // Otherwise, backup SST filenames consist of file_number, crc32c, file_size + // If kChecksumAndDbSessionId is the naming option and db_session_id is not + // empty, backup SST filenames consist of file_number, crc32c, db_session_id. + // Otherwise, backup SST filenames consist of file_number, crc32c, file_size. inline std::string GetSharedFileWithChecksum( const std::string& file, const uint32_t checksum_value, const uint64_t file_size, const std::string& db_session_id) const { assert(file.size() == 0 || file[0] != '/'); std::string file_copy = file; - const std::string suffix = UseNewNaming() && !db_session_id.empty() - ? db_session_id - : ROCKSDB_NAMESPACE::ToString(file_size); + const std::string suffix = + GetTableNamingOption() == kChecksumAndDbSessionId && + !db_session_id.empty() + ? db_session_id + : ROCKSDB_NAMESPACE::ToString(file_size); return file_copy.insert( file_copy.find_last_of('.'), "_" + ROCKSDB_NAMESPACE::ToString(checksum_value) + "_" + suffix); @@ -1457,7 +1461,7 @@ Status BackupEngineImpl::CopyOrCreateFile( if (s.ok()) { s = dest_writer->Close(); } - if (s.ok() && UseNewNaming()) { + if (s.ok() && GetTableNamingOption() == kChecksumAndDbSessionId) { // When copying SST files and using db_session_id in the name, // try to get DB identities // Note that when CopyOrCreateFile() is called while restoring, we still @@ -1503,7 +1507,7 @@ Status BackupEngineImpl::AddBackupFileWorkItem( if (!s.ok()) { return s; } - if (UseNewNaming()) { + if (GetTableNamingOption() == kChecksumAndDbSessionId) { // Prepare db_session_id to add to the file name // Ignore the returned status // In the failed cases, db_id and db_session_id will be empty @@ -1565,7 +1569,8 @@ Status BackupEngineImpl::AddBackupFileWorkItem( } else if (shared && (same_path || file_exists)) { need_to_copy = false; if (shared_checksum) { - if (UseNewNaming() && !db_session_id.empty()) { + if (GetTableNamingOption() == kChecksumAndDbSessionId && + !db_session_id.empty()) { ROCKS_LOG_INFO(options_.info_log, "%s already present, with checksum %u, size %" PRIu64 " and DB session identity %s", @@ -1598,7 +1603,7 @@ Status BackupEngineImpl::AddBackupFileWorkItem( } // try to get the db identities as they are also members of // the class CopyOrCreateResult - if (UseNewNaming()) { + if (GetTableNamingOption() == kChecksumAndDbSessionId) { assert(IsSstFile(fname)); ROCKS_LOG_INFO(options_.info_log, "%s checksum checksum calculated, try to obtain DB " @@ -1683,11 +1688,12 @@ Status BackupEngineImpl::GetFileDbIdentities(Env* src_env, const std::string& file_path, std::string* db_id, std::string* db_session_id) { - // Prepare the full_path of file_path under src_env for SstFileDumper + // // Prepare the full_path of file_path under src_env for SstFileDumper std::string full_path; src_env->GetAbsolutePath(file_path, &full_path); - - SstFileDumper sst_reader(Options(), full_path, + Options options; + options.env = src_env; + SstFileDumper sst_reader(options, full_path, 2 * 1024 * 1024 /* readahead_size */, false /* verify_checksum */, false /* output_hex */, @@ -2160,7 +2166,7 @@ class BackupEngineReadOnlyImpl : public BackupEngineReadOnly { } Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = true) override { + bool verify_with_checksum = false) override { return backup_engine_->VerifyBackup(backup_id, verify_with_checksum); } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index cbb912e7d..4f9dc6050 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -1101,7 +1101,7 @@ TEST_F(BackupableDBTest, CorruptFileMaintainSize) { // file checksums mismatch ASSERT_NOK(backup_engine_->VerifyBackup(1, true)); // sanity check, use default second argument - ASSERT_NOK(backup_engine_->VerifyBackup(1)); + ASSERT_OK(backup_engine_->VerifyBackup(1)); CloseDBAndBackupEngine(); // an extra challenge @@ -1370,11 +1370,12 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsTransition) { } } -// Verify backup and restore with share_files_with_checksum and -// new_naming_for_backup_files on +// Verify backup and restore with share_files_with_checksum on and +// share_files_with_checksum_naming = kChecksumAndDbSessionId TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNaming) { // Use session id in the name of SST file backup - ASSERT_TRUE(backupable_options_->new_naming_for_backup_files); + ASSERT_TRUE(backupable_options_->share_files_with_checksum_naming == + kChecksumAndDbSessionId); const int keys_iteration = 5000; OpenDBAndBackupEngine(true, false, kShareWithChecksum); for (int i = 0; i < 5; ++i) { @@ -1390,13 +1391,15 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNaming) { } // Verify backup and restore with share_files_with_checksum off and then -// transition this option and new_naming_for_backup_files to be on +// transition this option and share_files_with_checksum_naming to be +// kChecksumAndDbSessionId TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingTransition) { const int keys_iteration = 5000; - // We may set new_naming_for_backup_files to false here - // but even if it is true, it should have no effect when + // We may set share_files_with_checksum_naming to kChecksumAndFileSize + // here but even if we don't, it should have no effect when // share_files_with_checksum is false - ASSERT_TRUE(backupable_options_->new_naming_for_backup_files); + ASSERT_TRUE(backupable_options_->share_files_with_checksum_naming == + kChecksumAndDbSessionId); // set share_files_with_checksum to false OpenDBAndBackupEngine(true, false, kShareNoChecksum); for (int i = 0; i < 5; ++i) { @@ -1412,7 +1415,8 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingTransition) { // set share_files_with_checksum to true and do some more backups // and use session id in the name of SST file backup - ASSERT_TRUE(backupable_options_->new_naming_for_backup_files); + ASSERT_TRUE(backupable_options_->share_files_with_checksum_naming == + kChecksumAndDbSessionId); OpenDBAndBackupEngine(false /* destroy_old_data */, false, kShareWithChecksum); for (int i = 5; i < 10; ++i) { @@ -1426,8 +1430,9 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingTransition) { // For an extra challenge, make sure that GarbageCollect / DeleteBackup // is OK even if we open without share_table_files but with - // new_naming_for_backup_files on - ASSERT_TRUE(backupable_options_->new_naming_for_backup_files); + // share_files_with_checksum_naming being kChecksumAndDbSessionId + ASSERT_TRUE(backupable_options_->share_files_with_checksum_naming == + kChecksumAndDbSessionId); OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); backup_engine_->DeleteBackup(1); backup_engine_->GarbageCollect(); @@ -1436,9 +1441,10 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingTransition) { // Verify second (about to delete) AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 11); - // Turn off new_naming_for_backup_files and open without share_table_files + // Use checksum and file size for backup table file names and open without + // share_table_files // Again, make sure that GarbageCollect / DeleteBackup is OK - backupable_options_->new_naming_for_backup_files = false; + backupable_options_->share_files_with_checksum_naming = kChecksumAndFileSize; OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); backup_engine_->DeleteBackup(2); backup_engine_->GarbageCollect(); @@ -1451,11 +1457,10 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingTransition) { } } -// Verify backup and restore with share_files_with_checksum on but -// new_naming_for_backup_files off, then transition new_naming_for_backup_files -// to be on +// Verify backup and restore with share_files_with_checksum on and transition +// from kChecksumAndFileSize to kChecksumAndDbSessionId TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingUpgrade) { - backupable_options_->new_naming_for_backup_files = false; + backupable_options_->share_files_with_checksum_naming = kChecksumAndFileSize; const int keys_iteration = 5000; // set share_files_with_checksum to true OpenDBAndBackupEngine(true, false, kShareWithChecksum); @@ -1470,8 +1475,8 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingUpgrade) { keys_iteration * 6); } - // set new_naming_for_backup_files to true and do some more backups - backupable_options_->new_naming_for_backup_files = true; + backupable_options_->share_files_with_checksum_naming = + kChecksumAndDbSessionId; OpenDBAndBackupEngine(false /* destroy_old_data */, false, kShareWithChecksum); for (int i = 5; i < 10; ++i) { @@ -1493,9 +1498,10 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNamingUpgrade) { // Verify second (about to delete) AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 11); - // Turn off new_naming_for_backup_files and open without share_table_files + // Use checksum and file size for backup table file names and open without + // share_table_files // Again, make sure that GarbageCollect / DeleteBackup is OK - backupable_options_->new_naming_for_backup_files = false; + backupable_options_->share_files_with_checksum_naming = kChecksumAndFileSize; OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); backup_engine_->DeleteBackup(2); backup_engine_->GarbageCollect();