diff --git a/HISTORY.md b/HISTORY.md index b65f5a038..40d11096d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. +* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. @@ -20,7 +21,7 @@ ### Bug Fixes - + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 28b481678..588ac5110 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -92,26 +92,27 @@ Status ExternalSstFileIngestionJob::Prepare( // Copy/Move external files into DB for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - + f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); - if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - status = CopyFile(env_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { + // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; - } else { - f.copy_file = false; } } else { + f.copy_file = true; + } + + if (f.copy_file) { + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", + nullptr); status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); - f.copy_file = true; } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index cbbb2fa26..3850a2a03 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -16,6 +16,54 @@ namespace rocksdb { +// A test environment that can be configured to fail the Link operation. +class ExternalSSTTestEnv : public EnvWrapper { + public: + ExternalSSTTestEnv(Env* t, bool fail_link) + : EnvWrapper(t), fail_link_(fail_link) {} + + Status LinkFile(const std::string& s, const std::string& t) override { + if (fail_link_) { + return Status::NotSupported("Link failed"); + } + return target()->LinkFile(s, t); + } + + void set_fail_link(bool fail_link) { fail_link_ = fail_link; } + + private: + bool fail_link_; +}; + +class ExternSSTFileLinkFailFallbackTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + ExternSSTFileLinkFailFallbackTest() + : DBTestBase("/external_sst_file_test"), + test_env_(new ExternalSSTTestEnv(env_, true)) { + sst_files_dir_ = dbname_ + "/sst_files/"; + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.env = test_env_; + } + + void TearDown() override { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options_)); + delete test_env_; + test_env_ = nullptr; + } + + protected: + std::string sst_files_dir_; + Options options_; + ExternalSSTTestEnv* test_env_; +}; + class ExternalSSTFileTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -2014,17 +2062,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { } /* - * Test and verify the functionality of ingestion_options.move_files. + * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy */ -TEST_F(ExternalSSTFileTest, LinkExternalSst) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - DestroyAndReopen(options); +TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy; std::string file_path = sst_files_dir_ + "file1.sst"; // Create SstFileWriter for default column family - SstFileWriter sst_file_writer(EnvOptions(), options); + SstFileWriter sst_file_writer(env_options, options_); ASSERT_OK(sst_file_writer.Open(file_path)); for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); @@ -2033,9 +2087,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(file_path, &file_size)); - IngestExternalFileOptions ifo; - ifo.move_files = true; - ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); + bool copyfile = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:CopyFile", + [&](void* /* arg */) { copyfile = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = db_->IngestExternalFile({file_path}, ifo); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); @@ -2049,18 +2107,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { bytes_copied += stats.bytes_written; bytes_moved += stats.bytes_moved; } - // If bytes_moved > 0, it means external sst resides on the same FS - // supporting hard link operation. Therefore, - // 0 bytes should be copied, and the bytes_moved == file_size. - // Otherwise, FS does not support hard link, or external sst file resides on - // a different file system, then the bytes_copied should be equal to - // file_size. - if (bytes_moved > 0) { + + if (!fail_link) { + // Link operation succeeds. External SST should be moved. + ASSERT_OK(s); ASSERT_EQ(0, bytes_copied); ASSERT_EQ(file_size, bytes_moved); + ASSERT_FALSE(copyfile); } else { - ASSERT_EQ(file_size, bytes_copied); + // Link operation fails. + ASSERT_EQ(0, bytes_moved); + if (failed_move_fall_back_to_copy) { + ASSERT_OK(s); + // Copy file is true since a failed link falls back to copy file. + ASSERT_TRUE(copyfile); + ASSERT_EQ(file_size, bytes_copied); + } else { + ASSERT_TRUE(s.IsNotSupported()); + // Copy file is false since a failed link does not fall back to copy file. + ASSERT_FALSE(copyfile); + ASSERT_EQ(0, bytes_copied); + } } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } class TestIngestExternalFileListener : public EventListener { @@ -2666,6 +2735,12 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, std::make_tuple(true, false), std::make_tuple(true, true))); +INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest, + ExternSSTFileLinkFailFallbackTest, + testing::Values(std::make_tuple(true, false), + std::make_tuple(true, true), + std::make_tuple(false, false))); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 7d22fb675..cc7119410 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1398,6 +1398,8 @@ struct CompactRangeOptions { struct IngestExternalFileOptions { // Can be set to true to move the files instead of copying them. bool move_files = false; + // If set to true, ingestion falls back to copy when move fails. + bool failed_move_fall_back_to_copy = true; // If set to false, an ingested file keys could appear in existing snapshots // that where created before the file was ingested. bool snapshot_consistency = true;