Clean up some code related to file checksums (#6861)

Summary:
* Add missing unit test for schema stability of FileChecksumGenCrc32c
  (previously was only comparing to itself)
* A lot of clarifying comments
* Add some assertions for preconditions
* Rename WritableFileWriter::CalculateFileChecksum -> UpdateFileChecksum
* Simplify FileChecksumGenCrc32c with shared functions
* Implement EndianSwapValue to replace unused EndianTransform

And incidentally since I had trouble with 'make check-format' GitHub action disagreeing with local run,
* Output full diagnostic information when 'make check-format' fails in CI
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6861

Test Plan: new unit test passes before & after other changes

Reviewed By: zhichao-cao

Differential Revision: D21667115

Pulled By: pdillinger

fbshipit-source-id: 6a99970f87605aa024fa540c78cd519ff322c3e6
main
Peter Dillinger 4 years ago committed by Facebook GitHub Bot
parent eb04bb86c6
commit c7aedf1b48
  1. 2
      .github/workflows/sanity_check.yml
  2. 4
      build_tools/format-diff.sh
  3. 5
      file/writable_file_writer.cc
  4. 2
      file/writable_file_writer.h
  5. 13
      include/rocksdb/file_checksum.h
  6. 2
      include/rocksdb/utilities/backupable_db.h
  7. 30
      table/table_test.cc
  8. 38
      util/coding.h
  9. 62
      util/file_checksum_helper.h
  10. 10
      utilities/backupable/backupable_db.cc

@ -35,7 +35,7 @@ jobs:
args: https://raw.githubusercontent.com/llvm-mirror/clang/master/tools/clang-format/clang-format-diff.py args: https://raw.githubusercontent.com/llvm-mirror/clang/master/tools/clang-format/clang-format-diff.py
- name: Check format - name: Check format
run: make check-format run: VERBOSE_CHECK=1 make check-format
- name: Compare buckify output - name: Compare buckify output
run: make check-buck-targets run: make check-buck-targets

@ -124,6 +124,10 @@ then
elif [ $CHECK_ONLY ] elif [ $CHECK_ONLY ]
then then
echo "Your change has unformatted code. Please run make format!" echo "Your change has unformatted code. Please run make format!"
if [ $VERBOSE_CHECK ]; then
clang-format --version
echo "$diffs"
fi
exit 1 exit 1
fi fi

@ -31,7 +31,7 @@ IOStatus WritableFileWriter::Append(const Slice& data) {
rocksdb_kill_odds * REDUCE_ODDS2); rocksdb_kill_odds * REDUCE_ODDS2);
// Calculate the checksum of appended data // Calculate the checksum of appended data
CalculateFileChecksum(data); UpdateFileChecksum(data);
{ {
IOSTATS_TIMER_GUARD(prepare_write_nanos); IOSTATS_TIMER_GUARD(prepare_write_nanos);
@ -225,6 +225,7 @@ IOStatus WritableFileWriter::Flush() {
std::string WritableFileWriter::GetFileChecksum() { std::string WritableFileWriter::GetFileChecksum() {
if (checksum_generator_ != nullptr) { if (checksum_generator_ != nullptr) {
assert(checksum_finalized_);
return checksum_generator_->GetChecksum(); return checksum_generator_->GetChecksum();
} else { } else {
return kUnknownFileChecksum; return kUnknownFileChecksum;
@ -346,7 +347,7 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) {
return s; return s;
} }
void WritableFileWriter::CalculateFileChecksum(const Slice& data) { void WritableFileWriter::UpdateFileChecksum(const Slice& data) {
if (checksum_generator_ != nullptr) { if (checksum_generator_ != nullptr) {
checksum_generator_->Update(data.data(), data.size()); checksum_generator_->Update(data.data(), data.size());
} }

@ -51,7 +51,7 @@ class WritableFileWriter {
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
bool ShouldNotifyListeners() const { return !listeners_.empty(); } bool ShouldNotifyListeners() const { return !listeners_.empty(); }
void CalculateFileChecksum(const Slice& data); void UpdateFileChecksum(const Slice& data);
std::unique_ptr<FSWritableFile> writable_file_; std::unique_ptr<FSWritableFile> writable_file_;
std::string file_name_; std::string file_name_;

@ -24,6 +24,10 @@ struct FileChecksumGenContext {
// FileChecksumGenerator is the class to generates the checksum value // FileChecksumGenerator is the class to generates the checksum value
// for each file when the file is written to the file system. // for each file when the file is written to the file system.
// Implementations may assume that
// * Finalize is called at most once during the life of the object
// * All calls to Update come before Finalize
// * All calls to GetChecksum come after Finalize
class FileChecksumGenerator { class FileChecksumGenerator {
public: public:
virtual ~FileChecksumGenerator() {} virtual ~FileChecksumGenerator() {}
@ -36,7 +40,8 @@ class FileChecksumGenerator {
// Generate the final results if no further new data will be updated. // Generate the final results if no further new data will be updated.
virtual void Finalize() = 0; virtual void Finalize() = 0;
// Get the checksum // Get the checksum. The result should not be the empty string and may
// include arbitrary bytes, including non-printable characters.
virtual std::string GetChecksum() const = 0; virtual std::string GetChecksum() const = 0;
// Returns a name that identifies the current file checksum function. // Returns a name that identifies the current file checksum function.
@ -97,9 +102,13 @@ class FileChecksumList {
// Create a new file checksum list. // Create a new file checksum list.
extern FileChecksumList* NewFileChecksumList(); extern FileChecksumList* NewFileChecksumList();
// Return a shared_ptr of the builtin Crc32 based file checksum generatory // Return a shared_ptr of the builtin Crc32c based file checksum generatory
// factory object, which can be shared to create the Crc32c based checksum // factory object, which can be shared to create the Crc32c based checksum
// generator object. // generator object.
// Note: this implementation is compatible with many other crc32c checksum
// implementations and uses big-endian encoding of the result, unlike most
// other crc32c checksums in RocksDB, which alter the result with
// crc32c::Mask and use little-endian encoding.
extern std::shared_ptr<FileChecksumGenFactory> extern std::shared_ptr<FileChecksumGenFactory>
GetFileChecksumGenCrc32cFactory(); GetFileChecksumGenCrc32cFactory();

@ -89,7 +89,7 @@ struct BackupableDBOptions {
// Only used if share_table_files is set to true. If true, will consider that // Only used if share_table_files is set to true. If true, will consider that
// backups can come from different databases, hence a sst is not uniquely // backups can come from different databases, hence a sst is not uniquely
// identifed by its name, but by the triple (file name, crc32, file length) // identifed by its name, but by the triple (file name, crc32c, file length)
// Default: false // Default: false
// Note: this is an experimental option, and you'll need to set it manually // Note: this is an experimental option, and you'll need to set it manually
// *turn it on only if you know what you're doing* // *turn it on only if you know what you're doing*

@ -3297,7 +3297,7 @@ TEST_P(BlockBasedTableTest, NoFileChecksum) {
ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum.c_str()); ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum.c_str());
} }
TEST_P(BlockBasedTableTest, Crc32FileChecksum) { TEST_P(BlockBasedTableTest, Crc32cFileChecksum) {
FileChecksumGenCrc32cFactory* file_checksum_gen_factory = FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
new FileChecksumGenCrc32cFactory(); new FileChecksumGenCrc32cFactory();
Options options; Options options;
@ -3322,12 +3322,12 @@ TEST_P(BlockBasedTableTest, Crc32FileChecksum) {
FileChecksumGenContext gen_context; FileChecksumGenContext gen_context;
gen_context.file_name = "db/tmp"; gen_context.file_name = "db/tmp";
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen1 = std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator( options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context); gen_context);
FileChecksumTestHelper f(true); FileChecksumTestHelper f(true);
f.CreateWriteableFile(); f.CreateWriteableFile();
f.SetFileChecksumGenerator(checksum_crc32_gen1.release()); f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
std::unique_ptr<TableBuilder> builder; std::unique_ptr<TableBuilder> builder;
builder.reset(ioptions.table_factory->NewTableBuilder( builder.reset(ioptions.table_factory->NewTableBuilder(
TableBuilderOptions(ioptions, moptions, *comparator, TableBuilderOptions(ioptions, moptions, *comparator,
@ -3342,12 +3342,22 @@ TEST_P(BlockBasedTableTest, Crc32FileChecksum) {
f.WriteKVAndFlushTable(); f.WriteKVAndFlushTable();
ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen2 = std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator( options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context); gen_context);
std::string checksum; std::string checksum;
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32_gen2.get(), &checksum)); ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
// Unit test the generator itself for schema stability
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen3 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
const char data[] = "here is some data";
checksum_crc32c_gen3->Update(data, sizeof(data));
checksum_crc32c_gen3->Finalize();
checksum = checksum_crc32c_gen3->GetChecksum();
ASSERT_STREQ(checksum.c_str(), "\345\245\277\110");
} }
// Plain table is not supported in ROCKSDB_LITE // Plain table is not supported in ROCKSDB_LITE
@ -3441,7 +3451,7 @@ TEST_F(PlainTableTest, NoFileChecksum) {
EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum.c_str()); EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum.c_str());
} }
TEST_F(PlainTableTest, Crc32FileChecksum) { TEST_F(PlainTableTest, Crc32cFileChecksum) {
PlainTableOptions plain_table_options; PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 20; plain_table_options.user_key_len = 20;
plain_table_options.bloom_bits_per_key = 8; plain_table_options.bloom_bits_per_key = 8;
@ -3462,12 +3472,12 @@ TEST_F(PlainTableTest, Crc32FileChecksum) {
FileChecksumGenContext gen_context; FileChecksumGenContext gen_context;
gen_context.file_name = "db/tmp"; gen_context.file_name = "db/tmp";
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen1 = std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator( options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context); gen_context);
FileChecksumTestHelper f(true); FileChecksumTestHelper f(true);
f.CreateWriteableFile(); f.CreateWriteableFile();
f.SetFileChecksumGenerator(checksum_crc32_gen1.release()); f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder( std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
TableBuilderOptions( TableBuilderOptions(
@ -3481,11 +3491,11 @@ TEST_F(PlainTableTest, Crc32FileChecksum) {
f.WriteKVAndFlushTable(); f.WriteKVAndFlushTable();
ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen2 = std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator( options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context); gen_context);
std::string checksum; std::string checksum;
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32_gen2.get(), &checksum)); ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
} }

@ -7,8 +7,9 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
// //
// Endian-neutral encoding: // Encoding independent of machine byte order:
// * Fixed-length numbers are encoded with least-significant byte first // * Fixed-length numbers are encoded with least-significant byte first
// (little endian, native order on Intel and others)
// * In addition we support variable length "varint" encoding // * In addition we support variable length "varint" encoding
// * Strings are encoded prefixed by their length in varint format // * Strings are encoded prefixed by their length in varint format
@ -402,13 +403,34 @@ inline bool GetVarsignedint64(Slice* input, int64_t* value) {
} }
} }
// Provide an interface for platform independent endianness transformation // Swaps between big and little endian. Can be used to in combination
inline uint64_t EndianTransform(uint64_t input, size_t size) { // with the little-endian encoding/decoding functions to encode/decode
char* pos = reinterpret_cast<char*>(&input); // big endian.
uint64_t ret_val = 0; template <typename T>
for (size_t i = 0; i < size; ++i) { inline T EndianSwapValue(T v) {
ret_val |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i])) static_assert(std::is_integral<T>::value, "non-integral type");
<< ((size - i - 1) << 3));
#ifdef _MSC_VER
if (sizeof(T) == 2) {
return static_cast<T>(_byteswap_ushort(static_cast<uint16_t>(v)));
} else if (sizeof(T) == 4) {
return static_cast<T>(_byteswap_ulong(static_cast<uint32_t>(v)));
} else if (sizeof(T) == 8) {
return static_cast<T>(_byteswap_uint64(static_cast<uint64_t>(v)));
}
#else
if (sizeof(T) == 2) {
return static_cast<T>(__builtin_bswap16(static_cast<uint16_t>(v)));
} else if (sizeof(T) == 4) {
return static_cast<T>(__builtin_bswap32(static_cast<uint32_t>(v)));
} else if (sizeof(T) == 8) {
return static_cast<T>(__builtin_bswap64(static_cast<uint64_t>(v)));
}
#endif
// Recognized by clang as bswap, but not by gcc :(
T ret_val = 0;
for (size_t i = 0; i < sizeof(T); ++i) {
ret_val |= ((v >> (8 * i)) & 0xff) << (8 * (sizeof(T) - 1 - i));
} }
return ret_val; return ret_val;
} }

@ -6,11 +6,12 @@
#pragma once #pragma once
#include <cassert> #include <cassert>
#include <unordered_map> #include <unordered_map>
#include "port/port.h" #include "port/port.h"
#include "rocksdb/file_checksum.h" #include "rocksdb/file_checksum.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "util/coding.h"
#include "util/crc32c.h" #include "util/crc32c.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -26,60 +27,19 @@ class FileChecksumGenCrc32c : public FileChecksumGenerator {
checksum_ = crc32c::Extend(checksum_, data, n); checksum_ = crc32c::Extend(checksum_, data, n);
} }
void Finalize() override { checksum_str_ = Uint32ToString(checksum_); } void Finalize() override {
assert(checksum_str_.empty());
std::string GetChecksum() const override { return checksum_str_; } // Store as big endian raw bytes
PutFixed32(&checksum_str_, EndianSwapValue(checksum_));
const char* Name() const override { return "FileChecksumCrc32c"; }
// Convert a uint32_t type data into a 4 bytes string.
static std::string Uint32ToString(uint32_t v) {
std::string s;
if (port::kLittleEndian) {
s.append(reinterpret_cast<char*>(&v), sizeof(v));
} else {
char buf[sizeof(v)];
buf[0] = v & 0xff;
buf[1] = (v >> 8) & 0xff;
buf[2] = (v >> 16) & 0xff;
buf[3] = (v >> 24) & 0xff;
s.append(buf, sizeof(v));
}
size_t i = 0, j = s.size() - 1;
while (i < j) {
char tmp = s[i];
s[i] = s[j];
s[j] = tmp;
++i;
--j;
}
return s;
} }
// Convert a 4 bytes size string into a uint32_t type data. std::string GetChecksum() const override {
static uint32_t StringToUint32(std::string s) { assert(!checksum_str_.empty());
assert(s.size() == sizeof(uint32_t)); return checksum_str_;
size_t i = 0, j = s.size() - 1;
while (i < j) {
char tmp = s[i];
s[i] = s[j];
s[j] = tmp;
++i;
--j;
}
uint32_t v = 0;
if (port::kLittleEndian) {
memcpy(&v, s.c_str(), sizeof(uint32_t));
} else {
const char* buf = s.c_str();
v |= static_cast<uint32_t>(buf[0]);
v |= (static_cast<uint32_t>(buf[1]) << 8);
v |= (static_cast<uint32_t>(buf[2]) << 16);
v |= (static_cast<uint32_t>(buf[3]) << 24);
}
return v;
} }
const char* Name() const override { return "FileChecksumCrc32c"; }
private: private:
uint32_t checksum_; uint32_t checksum_;
std::string checksum_str_; std::string checksum_str_;

@ -1197,7 +1197,7 @@ Status BackupEngineImpl::RestoreDBFromBackup(const RestoreOptions& options,
std::string dst; std::string dst;
// 1. extract the filename // 1. extract the filename
size_t slash = file.find_last_of('/'); size_t slash = file.find_last_of('/');
// file will either be shared/<file>, shared_checksum/<file_crc32_size> // file will either be shared/<file>, shared_checksum/<file_crc32c_size>
// or private/<number>/<file> // or private/<number>/<file>
assert(slash != std::string::npos); assert(slash != std::string::npos);
dst = file.substr(slash + 1); dst = file.substr(slash + 1);
@ -1759,8 +1759,8 @@ Slice kMetaDataPrefix("metadata ");
// <seq number> // <seq number>
// <metadata(literal string)> <metadata> (optional) // <metadata(literal string)> <metadata> (optional)
// <number of files> // <number of files>
// <file1> <crc32(literal string)> <crc32_value> // <file1> <crc32(literal string)> <crc32c_value>
// <file2> <crc32(literal string)> <crc32_value> // <file2> <crc32(literal string)> <crc32c_value>
// ... // ...
Status BackupEngineImpl::BackupMeta::LoadFromFile( Status BackupEngineImpl::BackupMeta::LoadFromFile(
const std::string& backup_dir, const std::string& backup_dir,
@ -1808,6 +1808,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile(
std::vector<std::shared_ptr<FileInfo>> files; std::vector<std::shared_ptr<FileInfo>> files;
// WART: The checksums are crc32c, not original crc32
Slice checksum_prefix("crc32 "); Slice checksum_prefix("crc32 ");
for (uint32_t i = 0; s.ok() && i < num_files; ++i) { for (uint32_t i = 0; s.ok() && i < num_files; ++i) {
@ -1921,7 +1922,8 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) {
} }
for (const auto& file : files_) { for (const auto& file : files_) {
// use crc32 for now, switch to something else if needed // use crc32c for now, switch to something else if needed
// WART: The checksums are crc32c, not original crc32
size_t newlen = len + file->filename.length() + snprintf(writelen_temp, size_t newlen = len + file->filename.length() + snprintf(writelen_temp,
sizeof(writelen_temp), " crc32 %u\n", file->checksum_value); sizeof(writelen_temp), " crc32 %u\n", file->checksum_value);

Loading…
Cancel
Save