Provide an option so that SST ingestion won't fall back to copy after hard linking fails (#5333)

Summary: RocksDB always tries to perform a hard link operation on the external SST file to ingest. This operation can fail if the external SST resides on a different device/FS, or the underlying FS does not support hard link. Currently RocksDB assumes that if the link fails, the user is willing to perform file copy, which is not true according to the post. This commit provides an option named 'failed_move_fall_back_to_copy' for users to choose which behavior they want. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5333 Differential Revision: D15457597 Pulled By: HaoyuHuang fbshipit-source-id: f3626e13f845db4f7ed970a53ec8a2b1f0d62214
6 years ago · 74a334a2eb
parent 6a54278b4a
commit 74a334a2eb
4 changed files with 106 additions and 27 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -7,6 +7,7 @@
 * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
 * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees.
 * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL.
+* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error.

 ### Performance Improvements
 * Reduce binary search when iterator reseek into the same data block.
@ -20,7 +21,7 @@

 ### Bug Fixes

-	
+
 ## 6.2.0 (4/30/2019)
 ### New Features
 * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`.
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@ -92,26 +92,27 @@ Status ExternalSstFileIngestionJob::Prepare(
  // Copy/Move external files into DB
  for (IngestedFileInfo& f : files_to_ingest_) {
    f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
-
+    f.copy_file = false;
    const std::string path_outside_db = f.external_file_path;
    const std::string path_inside_db =
        TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(),
                      f.fd.GetPathId());
-
    if (ingestion_options_.move_files) {
      status = env_->LinkFile(path_outside_db, path_inside_db);
-      if (status.IsNotSupported()) {
-        // Original file is on a different FS, use copy instead of hard linking
-        status = CopyFile(env_, path_outside_db, path_inside_db, 0,
-                          db_options_.use_fsync);
+      if (status.IsNotSupported() &&
+          ingestion_options_.failed_move_fall_back_to_copy) {
+        // Original file is on a different FS, use copy instead of hard linking.
        f.copy_file = true;
-      } else {
-        f.copy_file = false;
      }
    } else {
+      f.copy_file = true;
+    }
+
+    if (f.copy_file) {
+      TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+                               nullptr);
      status = CopyFile(env_, path_outside_db, path_inside_db, 0,
                        db_options_.use_fsync);
-      f.copy_file = true;
    }
    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
    if (!status.ok()) {
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@ -16,6 +16,54 @@

 namespace rocksdb {

+// A test environment that can be configured to fail the Link operation.
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+  ExternalSSTTestEnv(Env* t, bool fail_link)
+      : EnvWrapper(t), fail_link_(fail_link) {}
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    if (fail_link_) {
+      return Status::NotSupported("Link failed");
+    }
+    return target()->LinkFile(s, t);
+  }
+
+  void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+  bool fail_link_;
+};
+
+class ExternSSTFileLinkFailFallbackTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternSSTFileLinkFailFallbackTest()
+      : DBTestBase("/external_sst_file_test"),
+        test_env_(new ExternalSSTTestEnv(env_, true)) {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    test::DestroyDir(env_, sst_files_dir_);
+    env_->CreateDir(sst_files_dir_);
+    options_ = CurrentOptions();
+    options_.disable_auto_compactions = true;
+    options_.env = test_env_;
+  }
+
+  void TearDown() override {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options_));
+    delete test_env_;
+    test_env_ = nullptr;
+  }
+
+ protected:
+  std::string sst_files_dir_;
+  Options options_;
+  ExternalSSTTestEnv* test_env_;
+};
+
 class ExternalSSTFileTest
    : public DBTestBase,
      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
@ -2014,17 +2062,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
 }

 /*
- * Test and verify the functionality of ingestion_options.move_files.
+ * Test and verify the functionality of ingestion_options.move_files and
+ * ingestion_options.failed_move_fall_back_to_copy
 */
-TEST_F(ExternalSSTFileTest, LinkExternalSst) {
-  Options options = CurrentOptions();
-  options.disable_auto_compactions = true;
-  DestroyAndReopen(options);
+TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
+  const bool fail_link = std::get<0>(GetParam());
+  const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
+  test_env_->set_fail_link(fail_link);
+  const EnvOptions env_options;
+  DestroyAndReopen(options_);
  const int kNumKeys = 10000;
+  IngestExternalFileOptions ifo;
+  ifo.move_files = true;
+  ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;

  std::string file_path = sst_files_dir_ + "file1.sst";
  // Create SstFileWriter for default column family
-  SstFileWriter sst_file_writer(EnvOptions(), options);
+  SstFileWriter sst_file_writer(env_options, options_);
  ASSERT_OK(sst_file_writer.Open(file_path));
  for (int i = 0; i < kNumKeys; i++) {
    ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
@ -2033,9 +2087,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) {
  uint64_t file_size = 0;
  ASSERT_OK(env_->GetFileSize(file_path, &file_size));

-  IngestExternalFileOptions ifo;
-  ifo.move_files = true;
-  ASSERT_OK(db_->IngestExternalFile({file_path}, ifo));
+  bool copyfile = false;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:CopyFile",
+      [&](void* /* arg */) { copyfile = true; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  const Status s = db_->IngestExternalFile({file_path}, ifo);

  ColumnFamilyHandleImpl* cfh =
      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
@ -2049,18 +2107,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) {
    bytes_copied += stats.bytes_written;
    bytes_moved += stats.bytes_moved;
  }
-  // If bytes_moved > 0, it means external sst resides on the same FS
-  // supporting hard link operation. Therefore,
-  // 0 bytes should be copied, and the bytes_moved == file_size.
-  // Otherwise, FS does not support hard link, or external sst file resides on
-  // a different file system, then the bytes_copied should be equal to
-  // file_size.
-  if (bytes_moved > 0) {
+
+  if (!fail_link) {
+    // Link operation succeeds. External SST should be moved.
+    ASSERT_OK(s);
    ASSERT_EQ(0, bytes_copied);
    ASSERT_EQ(file_size, bytes_moved);
+    ASSERT_FALSE(copyfile);
  } else {
-    ASSERT_EQ(file_size, bytes_copied);
+    // Link operation fails.
+    ASSERT_EQ(0, bytes_moved);
+    if (failed_move_fall_back_to_copy) {
+      ASSERT_OK(s);
+      // Copy file is true since a failed link falls back to copy file.
+      ASSERT_TRUE(copyfile);
+      ASSERT_EQ(file_size, bytes_copied);
+    } else {
+      ASSERT_TRUE(s.IsNotSupported());
+      // Copy file is false since a failed link does not fall back to copy file.
+      ASSERT_FALSE(copyfile);
+      ASSERT_EQ(0, bytes_copied);
+    }
  }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }

 class TestIngestExternalFileListener : public EventListener {
@ -2666,6 +2735,12 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
                                        std::make_tuple(true, false),
                                        std::make_tuple(true, true)));

+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+                        ExternSSTFileLinkFailFallbackTest,
+                        testing::Values(std::make_tuple(true, false),
+                                        std::make_tuple(true, true),
+                                        std::make_tuple(false, false)));
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -1398,6 +1398,8 @@ struct CompactRangeOptions {
 struct IngestExternalFileOptions {
  // Can be set to true to move the files instead of copying them.
  bool move_files = false;
+  // If set to true, ingestion falls back to copy when move fails.
+  bool failed_move_fall_back_to_copy = true;
  // If set to false, an ingested file keys could appear in existing snapshots
  // that where created before the file was ingested.
  bool snapshot_consistency = true;