From 545e14b53be58a282eb3ae05fe627589f96d9de0 Mon Sep 17 00:00:00 2001
From: Zhichao Cao
Date: Wed, 20 May 2020 11:53:49 -0700
Subject: [PATCH] Generate file checksum in SstFileWriter (#6859)

Summary:
If Options.file_checksum_gen_factory is set, rocksdb generates the file checksum during flush and compaction based on the checksum generator created by the factory and stores the checksum and function name in vstorage and Manifest.

This PR enables file checksum generation in SstFileWriter and stores the checksum and checksum function name in the ExternalSstFileInfo, such that applications can use them for other purposes, for example, ingest the file checksum with files in IngestExternalFile().
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6859

Test Plan: add unit test and pass make asan_check.

Reviewed By: ajkr

Differential Revision: D21656247

Pulled By: zhichao-cao

fbshipit-source-id: 78a3570c76031d8832e3d2de3d6c79cdf2b675d0
---
Note (ignored by git-am): the line structure of this patch was reconstructed
from a whitespace-flattened copy. Leading whitespace in context lines may not
be byte-exact; if a hunk fails to apply, retry with --ignore-whitespace.

 HISTORY.md                         |   1 +
 db/external_sst_file_basic_test.cc | 106 +++++++++++++++++++++++++++++
 include/rocksdb/sst_file_writer.h  |   6 ++
 table/sst_file_writer.cc           |  13 ++--
 4 files changed, 122 insertions(+), 4 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index ecb5f7948..d5d79301e 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,6 +18,7 @@
 
 ### New Feature
 * sst_dump to add a new --readahead_size argument. Users can specify read size when scanning the data. Sst_dump also tries to prefetch tail part of the SST files so usually some number of I/Os are saved there too.
+* Generate file checksum in SstFileWriter if Options.file_checksum_gen_factory is set. The checksum and checksum function name are stored in ExternalSstFileInfo after the sst file write is finished.
 
 ## 6.10 (5/2/2020)
 ### Bug Fixes
diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index 7f0ba213b..610faf5dd 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -6,6 +6,7 @@
 #include <functional>
 
 #include "db/db_test_util.h"
+#include "db/version_edit.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "rocksdb/sst_file_writer.h"
@@ -174,6 +175,111 @@ TEST_F(ExternalSSTFileBasicTest, Basic) {
   ASSERT_EQ(file1_info.num_range_del_entries, 0);
   ASSERT_EQ(file1_info.smallest_range_del_key, "");
   ASSERT_EQ(file1_info.largest_range_del_key, "");
+  ASSERT_EQ(file1_info.file_checksum, kUnknownFileChecksum);
+  ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+  // sst_file_writer already finished, cannot add this value
+  s = sst_file_writer.Put(Key(100), "bad_val");
+  ASSERT_FALSE(s.ok()) << s.ToString();
+  s = sst_file_writer.DeleteRange(Key(100), Key(200));
+  ASSERT_FALSE(s.ok()) << s.ToString();
+
+  DestroyAndReopen(options);
+  // Add file using file path
+  s = DeprecatedAddFile({file1});
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+  for (int k = 0; k < 100; k++) {
+    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+  }
+
+  DestroyAndRecreateExternalSSTFilesDir();
+}
+
+class ChecksumVerifyHelper {
+ private:
+  Options options_;
+
+ public:
+  ChecksumVerifyHelper(Options& options) : options_(options) {}
+  ~ChecksumVerifyHelper() {}
+
+  Status GetSingleFileChecksumAndFuncName(
+      const std::string& file_path, std::string* file_checksum,
+      std::string* file_checksum_func_name) {
+    Status s;
+    EnvOptions soptions;
+    std::unique_ptr<SequentialFile> file_reader;
+    s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+    std::unique_ptr<char[]> scratch(new char[2048]);
+    Slice result;
+    FileChecksumGenFactory* file_checksum_gen_factory =
+        options_.file_checksum_gen_factory.get();
+    if (file_checksum_gen_factory == nullptr) {
+      *file_checksum = kUnknownFileChecksum;
+      *file_checksum_func_name = kUnknownFileChecksumFuncName;
+      return Status::OK();
+    } else {
+      FileChecksumGenContext gen_context;
+      std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+          file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
+      *file_checksum_func_name = file_checksum_gen->Name();
+      s = file_reader->Read(2048, &result, scratch.get());
+      if (!s.ok()) {
+        return s;
+      }
+      while (result.size() != 0) {
+        file_checksum_gen->Update(scratch.get(), result.size());
+        s = file_reader->Read(2048, &result, scratch.get());
+        if (!s.ok()) {
+          return s;
+        }
+      }
+      file_checksum_gen->Finalize();
+      *file_checksum = file_checksum_gen->GetChecksum();
+    }
+    return Status::OK();
+  }
+};
+
+TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) {
+  Options options = CurrentOptions();
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  ChecksumVerifyHelper checksum_helper(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before open a
+  // file.
+  ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  std::string file_checksum, file_checksum_func_name;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file1, &file_checksum, &file_checksum_func_name));
+
+  // Current file size should be non-zero after success write.
+  ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+  ASSERT_EQ(file1_info.num_range_del_entries, 0);
+  ASSERT_EQ(file1_info.smallest_range_del_key, "");
+  ASSERT_EQ(file1_info.largest_range_del_key, "");
+  ASSERT_EQ(file1_info.file_checksum, file_checksum);
+  ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name);
   // sst_file_writer already finished, cannot add this value
   s = sst_file_writer.Put(Key(100), "bad_val");
   ASSERT_FALSE(s.ok()) << s.ToString();
diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h
index e83383fea..c7a8203e1 100644
--- a/include/rocksdb/sst_file_writer.h
+++ b/include/rocksdb/sst_file_writer.h
@@ -34,6 +34,8 @@ struct ExternalSstFileInfo {
         largest_key(""),
         smallest_range_del_key(""),
         largest_range_del_key(""),
+        file_checksum(""),
+        file_checksum_func_name(""),
         sequence_number(0),
         file_size(0),
         num_entries(0),
@@ -50,6 +52,8 @@ struct ExternalSstFileInfo {
         largest_key(_largest_key),
         smallest_range_del_key(""),
         largest_range_del_key(""),
+        file_checksum(""),
+        file_checksum_func_name(""),
         sequence_number(_sequence_number),
         file_size(_file_size),
         num_entries(_num_entries),
@@ -62,6 +66,8 @@ struct ExternalSstFileInfo {
   std::string
       smallest_range_del_key;  // smallest range deletion user key in file
   std::string largest_range_del_key;  // largest range deletion user key in file
+  std::string file_checksum;          // sst file checksum;
+  std::string file_checksum_func_name;  // The name of file checksum function
   SequenceNumber sequence_number;     // sequence number of all keys in file
   uint64_t file_size;                 // file size in bytes
   uint64_t num_entries;               // number of entries in file
diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc
index 2d621889f..2c54c46cf 100644
--- a/table/sst_file_writer.cc
+++ b/table/sst_file_writer.cc
@@ -243,10 +243,10 @@ Status SstFileWriter::Open(const std::string& file_path) {
       &int_tbl_prop_collector_factories, compression_type,
       sample_for_compression, compression_opts, r->skip_filters,
       r->column_family_name, unknown_level);
-  r->file_writer.reset(
-      new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(sst_file)),
-                             file_path, r->env_options, r->ioptions.env,
-                             nullptr /* stats */, r->ioptions.listeners));
+  r->file_writer.reset(new WritableFileWriter(
+      NewLegacyWritableFileWrapper(std::move(sst_file)), file_path,
+      r->env_options, r->ioptions.env, nullptr /* stats */,
+      r->ioptions.listeners, r->ioptions.file_checksum_gen_factory));
 
   // TODO(tec) : If table_factory is using compressed block cache, we will
   // be adding the external sst file blocks into it, which is wasteful.
@@ -300,6 +300,11 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
       s = r->file_writer->Close();
     }
   }
+  if (s.ok()) {
+    r->file_info.file_checksum = r->file_writer->GetFileChecksum();
+    r->file_info.file_checksum_func_name =
+        r->file_writer->GetFileChecksumFuncName();
+  }
   if (!s.ok()) {
     r->ioptions.env->DeleteFile(r->file_info.file_path);
   }