Provide an option so that SST ingestion won't fall back to copy after hard linking fails (#5333)

Summary:
RocksDB always tries to perform a hard link operation on the external SST file to ingest. This operation can fail if the external SST resides on a different device/FS, or the underlying FS does not support hard link. Currently RocksDB assumes that if the link fails, the user is willing to perform file copy, which is not true according to the post. This commit provides an option named  'failed_move_fall_back_to_copy' for users to choose which behavior they want.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5333

Differential Revision: D15457597

Pulled By: HaoyuHuang

fbshipit-source-id: f3626e13f845db4f7ed970a53ec8a2b1f0d62214
main
haoyuhuang 6 years ago committed by Facebook Github Bot
parent 6a54278b4a
commit 74a334a2eb
  1. 1
      HISTORY.md
  2. 19
      db/external_sst_file_ingestion_job.cc
  3. 109
      db/external_sst_file_test.cc
  4. 2
      include/rocksdb/options.h

@ -7,6 +7,7 @@
* Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees.
* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL.
* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error.
### Performance Improvements ### Performance Improvements
* Reduce binary search when iterator reseek into the same data block. * Reduce binary search when iterator reseek into the same data block.

@ -92,26 +92,27 @@ Status ExternalSstFileIngestionJob::Prepare(
// Copy/Move external files into DB // Copy/Move external files into DB
for (IngestedFileInfo& f : files_to_ingest_) { for (IngestedFileInfo& f : files_to_ingest_) {
f.fd = FileDescriptor(next_file_number++, 0, f.file_size); f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
f.copy_file = false;
const std::string path_outside_db = f.external_file_path; const std::string path_outside_db = f.external_file_path;
const std::string path_inside_db = const std::string path_inside_db =
TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(),
f.fd.GetPathId()); f.fd.GetPathId());
if (ingestion_options_.move_files) { if (ingestion_options_.move_files) {
status = env_->LinkFile(path_outside_db, path_inside_db); status = env_->LinkFile(path_outside_db, path_inside_db);
if (status.IsNotSupported()) { if (status.IsNotSupported() &&
// Original file is on a different FS, use copy instead of hard linking ingestion_options_.failed_move_fall_back_to_copy) {
status = CopyFile(env_, path_outside_db, path_inside_db, 0, // Original file is on a different FS, use copy instead of hard linking.
db_options_.use_fsync);
f.copy_file = true; f.copy_file = true;
} else {
f.copy_file = false;
} }
} else { } else {
f.copy_file = true;
}
if (f.copy_file) {
TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
nullptr);
status = CopyFile(env_, path_outside_db, path_inside_db, 0, status = CopyFile(env_, path_outside_db, path_inside_db, 0,
db_options_.use_fsync); db_options_.use_fsync);
f.copy_file = true;
} }
TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
if (!status.ok()) { if (!status.ok()) {

@ -16,6 +16,54 @@
namespace rocksdb { namespace rocksdb {
// A test environment that can be configured to fail the Link operation.
class ExternalSSTTestEnv : public EnvWrapper {
public:
ExternalSSTTestEnv(Env* t, bool fail_link)
: EnvWrapper(t), fail_link_(fail_link) {}
Status LinkFile(const std::string& s, const std::string& t) override {
if (fail_link_) {
return Status::NotSupported("Link failed");
}
return target()->LinkFile(s, t);
}
void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
private:
bool fail_link_;
};
class ExternSSTFileLinkFailFallbackTest
: public DBTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public:
ExternSSTFileLinkFailFallbackTest()
: DBTestBase("/external_sst_file_test"),
test_env_(new ExternalSSTTestEnv(env_, true)) {
sst_files_dir_ = dbname_ + "/sst_files/";
test::DestroyDir(env_, sst_files_dir_);
env_->CreateDir(sst_files_dir_);
options_ = CurrentOptions();
options_.disable_auto_compactions = true;
options_.env = test_env_;
}
void TearDown() override {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, options_));
delete test_env_;
test_env_ = nullptr;
}
protected:
std::string sst_files_dir_;
Options options_;
ExternalSSTTestEnv* test_env_;
};
class ExternalSSTFileTest class ExternalSSTFileTest
: public DBTestBase, : public DBTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> { public ::testing::WithParamInterface<std::tuple<bool, bool>> {
@ -2014,17 +2062,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
} }
/* /*
* Test and verify the functionality of ingestion_options.move_files. * Test and verify the functionality of ingestion_options.move_files and
* ingestion_options.failed_move_fall_back_to_copy
*/ */
TEST_F(ExternalSSTFileTest, LinkExternalSst) { TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
Options options = CurrentOptions(); const bool fail_link = std::get<0>(GetParam());
options.disable_auto_compactions = true; const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
DestroyAndReopen(options); test_env_->set_fail_link(fail_link);
const EnvOptions env_options;
DestroyAndReopen(options_);
const int kNumKeys = 10000; const int kNumKeys = 10000;
IngestExternalFileOptions ifo;
ifo.move_files = true;
ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
std::string file_path = sst_files_dir_ + "file1.sst"; std::string file_path = sst_files_dir_ + "file1.sst";
// Create SstFileWriter for default column family // Create SstFileWriter for default column family
SstFileWriter sst_file_writer(EnvOptions(), options); SstFileWriter sst_file_writer(env_options, options_);
ASSERT_OK(sst_file_writer.Open(file_path)); ASSERT_OK(sst_file_writer.Open(file_path));
for (int i = 0; i < kNumKeys; i++) { for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
@ -2033,9 +2087,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) {
uint64_t file_size = 0; uint64_t file_size = 0;
ASSERT_OK(env_->GetFileSize(file_path, &file_size)); ASSERT_OK(env_->GetFileSize(file_path, &file_size));
IngestExternalFileOptions ifo; bool copyfile = false;
ifo.move_files = true; rocksdb::SyncPoint::GetInstance()->SetCallBack(
ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); "ExternalSstFileIngestionJob::Prepare:CopyFile",
[&](void* /* arg */) { copyfile = true; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
const Status s = db_->IngestExternalFile({file_path}, ifo);
ColumnFamilyHandleImpl* cfh = ColumnFamilyHandleImpl* cfh =
static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily()); static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
@ -2049,18 +2107,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) {
bytes_copied += stats.bytes_written; bytes_copied += stats.bytes_written;
bytes_moved += stats.bytes_moved; bytes_moved += stats.bytes_moved;
} }
// If bytes_moved > 0, it means external sst resides on the same FS
// supporting hard link operation. Therefore, if (!fail_link) {
// 0 bytes should be copied, and the bytes_moved == file_size. // Link operation succeeds. External SST should be moved.
// Otherwise, FS does not support hard link, or external sst file resides on ASSERT_OK(s);
// a different file system, then the bytes_copied should be equal to
// file_size.
if (bytes_moved > 0) {
ASSERT_EQ(0, bytes_copied); ASSERT_EQ(0, bytes_copied);
ASSERT_EQ(file_size, bytes_moved); ASSERT_EQ(file_size, bytes_moved);
ASSERT_FALSE(copyfile);
} else { } else {
ASSERT_EQ(file_size, bytes_copied); // Link operation fails.
ASSERT_EQ(0, bytes_moved);
if (failed_move_fall_back_to_copy) {
ASSERT_OK(s);
// Copy file is true since a failed link falls back to copy file.
ASSERT_TRUE(copyfile);
ASSERT_EQ(file_size, bytes_copied);
} else {
ASSERT_TRUE(s.IsNotSupported());
// Copy file is false since a failed link does not fall back to copy file.
ASSERT_FALSE(copyfile);
ASSERT_EQ(0, bytes_copied);
}
} }
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
} }
class TestIngestExternalFileListener : public EventListener { class TestIngestExternalFileListener : public EventListener {
@ -2666,6 +2735,12 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
std::make_tuple(true, false), std::make_tuple(true, false),
std::make_tuple(true, true))); std::make_tuple(true, true)));
INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
ExternSSTFileLinkFailFallbackTest,
testing::Values(std::make_tuple(true, false),
std::make_tuple(true, true),
std::make_tuple(false, false)));
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -1398,6 +1398,8 @@ struct CompactRangeOptions {
struct IngestExternalFileOptions { struct IngestExternalFileOptions {
// Can be set to true to move the files instead of copying them. // Can be set to true to move the files instead of copying them.
bool move_files = false; bool move_files = false;
// If set to true, ingestion falls back to copy when move fails.
bool failed_move_fall_back_to_copy = true;
// If set to false, an ingested file keys could appear in existing snapshots // If set to false, an ingested file keys could appear in existing snapshots
// that where created before the file was ingested. // that where created before the file was ingested.
bool snapshot_consistency = true; bool snapshot_consistency = true;

Loading…
Cancel
Save