Clean up some code related to file checksums (#6861)

Summary:
* Add missing unit test for schema stability of FileChecksumGenCrc32c
  (previously was only comparing to itself)
* A lot of clarifying comments
* Add some assertions for preconditions
* Rename WritableFileWriter::CalculateFileChecksum -> UpdateFileChecksum
* Simplify FileChecksumGenCrc32c with shared functions
* Implement EndianSwapValue to replace unused EndianTransform

And incidentally since I had trouble with 'make check-format' GitHub action disagreeing with local run,
* Output full diagnostic information when 'make check-format' fails in CI
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6861

Test Plan: new unit test passes before & after other changes

Reviewed By: zhichao-cao

Differential Revision: D21667115

Pulled By: pdillinger

fbshipit-source-id: 6a99970f87605aa024fa540c78cd519ff322c3e6
main
Peter Dillinger 4 years ago committed by Facebook GitHub Bot
parent eb04bb86c6
commit c7aedf1b48
  1. 2
      .github/workflows/sanity_check.yml
  2. 4
      build_tools/format-diff.sh
  3. 5
      file/writable_file_writer.cc
  4. 2
      file/writable_file_writer.h
  5. 13
      include/rocksdb/file_checksum.h
  6. 2
      include/rocksdb/utilities/backupable_db.h
  7. 30
      table/table_test.cc
  8. 38
      util/coding.h
  9. 62
      util/file_checksum_helper.h
  10. 10
      utilities/backupable/backupable_db.cc

@ -35,7 +35,7 @@ jobs:
args: https://raw.githubusercontent.com/llvm-mirror/clang/master/tools/clang-format/clang-format-diff.py
- name: Check format
run: make check-format
run: VERBOSE_CHECK=1 make check-format
- name: Compare buckify output
run: make check-buck-targets

@ -124,6 +124,10 @@ then
elif [ $CHECK_ONLY ]
then
echo "Your change has unformatted code. Please run make format!"
if [ $VERBOSE_CHECK ]; then
clang-format --version
echo "$diffs"
fi
exit 1
fi

@ -31,7 +31,7 @@ IOStatus WritableFileWriter::Append(const Slice& data) {
rocksdb_kill_odds * REDUCE_ODDS2);
// Calculate the checksum of appended data
CalculateFileChecksum(data);
UpdateFileChecksum(data);
{
IOSTATS_TIMER_GUARD(prepare_write_nanos);
@ -225,6 +225,7 @@ IOStatus WritableFileWriter::Flush() {
std::string WritableFileWriter::GetFileChecksum() {
if (checksum_generator_ != nullptr) {
assert(checksum_finalized_);
return checksum_generator_->GetChecksum();
} else {
return kUnknownFileChecksum;
@ -346,7 +347,7 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) {
return s;
}
void WritableFileWriter::CalculateFileChecksum(const Slice& data) {
void WritableFileWriter::UpdateFileChecksum(const Slice& data) {
if (checksum_generator_ != nullptr) {
checksum_generator_->Update(data.data(), data.size());
}

@ -51,7 +51,7 @@ class WritableFileWriter {
#endif // ROCKSDB_LITE
bool ShouldNotifyListeners() const { return !listeners_.empty(); }
void CalculateFileChecksum(const Slice& data);
void UpdateFileChecksum(const Slice& data);
std::unique_ptr<FSWritableFile> writable_file_;
std::string file_name_;

@ -24,6 +24,10 @@ struct FileChecksumGenContext {
// FileChecksumGenerator is the class to generates the checksum value
// for each file when the file is written to the file system.
// Implementations may assume that
// * Finalize is called at most once during the life of the object
// * All calls to Update come before Finalize
// * All calls to GetChecksum come after Finalize
class FileChecksumGenerator {
public:
virtual ~FileChecksumGenerator() {}
@ -36,7 +40,8 @@ class FileChecksumGenerator {
// Generate the final results if no further new data will be updated.
virtual void Finalize() = 0;
// Get the checksum
// Get the checksum. The result should not be the empty string and may
// include arbitrary bytes, including non-printable characters.
virtual std::string GetChecksum() const = 0;
// Returns a name that identifies the current file checksum function.
@ -97,9 +102,13 @@ class FileChecksumList {
// Create a new file checksum list.
extern FileChecksumList* NewFileChecksumList();
// Return a shared_ptr of the builtin Crc32 based file checksum generatory
// Return a shared_ptr of the builtin Crc32c based file checksum generatory
// factory object, which can be shared to create the Crc32c based checksum
// generator object.
// Note: this implementation is compatible with many other crc32c checksum
// implementations and uses big-endian encoding of the result, unlike most
// other crc32c checksums in RocksDB, which alter the result with
// crc32c::Mask and use little-endian encoding.
extern std::shared_ptr<FileChecksumGenFactory>
GetFileChecksumGenCrc32cFactory();

@ -89,7 +89,7 @@ struct BackupableDBOptions {
// Only used if share_table_files is set to true. If true, will consider that
// backups can come from different databases, hence a sst is not uniquely
// identifed by its name, but by the triple (file name, crc32, file length)
// identifed by its name, but by the triple (file name, crc32c, file length)
// Default: false
// Note: this is an experimental option, and you'll need to set it manually
// *turn it on only if you know what you're doing*

@ -3297,7 +3297,7 @@ TEST_P(BlockBasedTableTest, NoFileChecksum) {
ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum.c_str());
}
TEST_P(BlockBasedTableTest, Crc32FileChecksum) {
TEST_P(BlockBasedTableTest, Crc32cFileChecksum) {
FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
new FileChecksumGenCrc32cFactory();
Options options;
@ -3322,12 +3322,12 @@ TEST_P(BlockBasedTableTest, Crc32FileChecksum) {
FileChecksumGenContext gen_context;
gen_context.file_name = "db/tmp";
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen1 =
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
FileChecksumTestHelper f(true);
f.CreateWriteableFile();
f.SetFileChecksumGenerator(checksum_crc32_gen1.release());
f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
std::unique_ptr<TableBuilder> builder;
builder.reset(ioptions.table_factory->NewTableBuilder(
TableBuilderOptions(ioptions, moptions, *comparator,
@ -3342,12 +3342,22 @@ TEST_P(BlockBasedTableTest, Crc32FileChecksum) {
f.WriteKVAndFlushTable();
ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen2 =
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
std::string checksum;
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32_gen2.get(), &checksum));
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
// Unit test the generator itself for schema stability
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen3 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
const char data[] = "here is some data";
checksum_crc32c_gen3->Update(data, sizeof(data));
checksum_crc32c_gen3->Finalize();
checksum = checksum_crc32c_gen3->GetChecksum();
ASSERT_STREQ(checksum.c_str(), "\345\245\277\110");
}
// Plain table is not supported in ROCKSDB_LITE
@ -3441,7 +3451,7 @@ TEST_F(PlainTableTest, NoFileChecksum) {
EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum.c_str());
}
TEST_F(PlainTableTest, Crc32FileChecksum) {
TEST_F(PlainTableTest, Crc32cFileChecksum) {
PlainTableOptions plain_table_options;
plain_table_options.user_key_len = 20;
plain_table_options.bloom_bits_per_key = 8;
@ -3462,12 +3472,12 @@ TEST_F(PlainTableTest, Crc32FileChecksum) {
FileChecksumGenContext gen_context;
gen_context.file_name = "db/tmp";
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen1 =
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
FileChecksumTestHelper f(true);
f.CreateWriteableFile();
f.SetFileChecksumGenerator(checksum_crc32_gen1.release());
f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
TableBuilderOptions(
@ -3481,11 +3491,11 @@ TEST_F(PlainTableTest, Crc32FileChecksum) {
f.WriteKVAndFlushTable();
ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
std::unique_ptr<FileChecksumGenerator> checksum_crc32_gen2 =
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
std::string checksum;
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32_gen2.get(), &checksum));
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
}

@ -7,8 +7,9 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Endian-neutral encoding:
// Encoding independent of machine byte order:
// * Fixed-length numbers are encoded with least-significant byte first
// (little endian, native order on Intel and others)
// * In addition we support variable length "varint" encoding
// * Strings are encoded prefixed by their length in varint format
@ -402,13 +403,34 @@ inline bool GetVarsignedint64(Slice* input, int64_t* value) {
}
}
// Provide an interface for platform independent endianness transformation
inline uint64_t EndianTransform(uint64_t input, size_t size) {
char* pos = reinterpret_cast<char*>(&input);
uint64_t ret_val = 0;
for (size_t i = 0; i < size; ++i) {
ret_val |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
<< ((size - i - 1) << 3));
// Swaps between big and little endian. Can be used to in combination
// with the little-endian encoding/decoding functions to encode/decode
// big endian.
template <typename T>
inline T EndianSwapValue(T v) {
static_assert(std::is_integral<T>::value, "non-integral type");
#ifdef _MSC_VER
if (sizeof(T) == 2) {
return static_cast<T>(_byteswap_ushort(static_cast<uint16_t>(v)));
} else if (sizeof(T) == 4) {
return static_cast<T>(_byteswap_ulong(static_cast<uint32_t>(v)));
} else if (sizeof(T) == 8) {
return static_cast<T>(_byteswap_uint64(static_cast<uint64_t>(v)));
}
#else
if (sizeof(T) == 2) {
return static_cast<T>(__builtin_bswap16(static_cast<uint16_t>(v)));
} else if (sizeof(T) == 4) {
return static_cast<T>(__builtin_bswap32(static_cast<uint32_t>(v)));
} else if (sizeof(T) == 8) {
return static_cast<T>(__builtin_bswap64(static_cast<uint64_t>(v)));
}
#endif
// Recognized by clang as bswap, but not by gcc :(
T ret_val = 0;
for (size_t i = 0; i < sizeof(T); ++i) {
ret_val |= ((v >> (8 * i)) & 0xff) << (8 * (sizeof(T) - 1 - i));
}
return ret_val;
}

@ -6,11 +6,12 @@
#pragma once
#include <cassert>
#include <unordered_map>
#include "port/port.h"
#include "rocksdb/file_checksum.h"
#include "rocksdb/status.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
@ -26,60 +27,19 @@ class FileChecksumGenCrc32c : public FileChecksumGenerator {
checksum_ = crc32c::Extend(checksum_, data, n);
}
void Finalize() override { checksum_str_ = Uint32ToString(checksum_); }
std::string GetChecksum() const override { return checksum_str_; }
const char* Name() const override { return "FileChecksumCrc32c"; }
// Convert a uint32_t type data into a 4 bytes string.
static std::string Uint32ToString(uint32_t v) {
std::string s;
if (port::kLittleEndian) {
s.append(reinterpret_cast<char*>(&v), sizeof(v));
} else {
char buf[sizeof(v)];
buf[0] = v & 0xff;
buf[1] = (v >> 8) & 0xff;
buf[2] = (v >> 16) & 0xff;
buf[3] = (v >> 24) & 0xff;
s.append(buf, sizeof(v));
}
size_t i = 0, j = s.size() - 1;
while (i < j) {
char tmp = s[i];
s[i] = s[j];
s[j] = tmp;
++i;
--j;
}
return s;
void Finalize() override {
assert(checksum_str_.empty());
// Store as big endian raw bytes
PutFixed32(&checksum_str_, EndianSwapValue(checksum_));
}
// Convert a 4 bytes size string into a uint32_t type data.
static uint32_t StringToUint32(std::string s) {
assert(s.size() == sizeof(uint32_t));
size_t i = 0, j = s.size() - 1;
while (i < j) {
char tmp = s[i];
s[i] = s[j];
s[j] = tmp;
++i;
--j;
}
uint32_t v = 0;
if (port::kLittleEndian) {
memcpy(&v, s.c_str(), sizeof(uint32_t));
} else {
const char* buf = s.c_str();
v |= static_cast<uint32_t>(buf[0]);
v |= (static_cast<uint32_t>(buf[1]) << 8);
v |= (static_cast<uint32_t>(buf[2]) << 16);
v |= (static_cast<uint32_t>(buf[3]) << 24);
}
return v;
std::string GetChecksum() const override {
assert(!checksum_str_.empty());
return checksum_str_;
}
const char* Name() const override { return "FileChecksumCrc32c"; }
private:
uint32_t checksum_;
std::string checksum_str_;

@ -1197,7 +1197,7 @@ Status BackupEngineImpl::RestoreDBFromBackup(const RestoreOptions& options,
std::string dst;
// 1. extract the filename
size_t slash = file.find_last_of('/');
// file will either be shared/<file>, shared_checksum/<file_crc32_size>
// file will either be shared/<file>, shared_checksum/<file_crc32c_size>
// or private/<number>/<file>
assert(slash != std::string::npos);
dst = file.substr(slash + 1);
@ -1759,8 +1759,8 @@ Slice kMetaDataPrefix("metadata ");
// <seq number>
// <metadata(literal string)> <metadata> (optional)
// <number of files>
// <file1> <crc32(literal string)> <crc32_value>
// <file2> <crc32(literal string)> <crc32_value>
// <file1> <crc32(literal string)> <crc32c_value>
// <file2> <crc32(literal string)> <crc32c_value>
// ...
Status BackupEngineImpl::BackupMeta::LoadFromFile(
const std::string& backup_dir,
@ -1808,6 +1808,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile(
std::vector<std::shared_ptr<FileInfo>> files;
// WART: The checksums are crc32c, not original crc32
Slice checksum_prefix("crc32 ");
for (uint32_t i = 0; s.ok() && i < num_files; ++i) {
@ -1921,7 +1922,8 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) {
}
for (const auto& file : files_) {
// use crc32 for now, switch to something else if needed
// use crc32c for now, switch to something else if needed
// WART: The checksums are crc32c, not original crc32
size_t newlen = len + file->filename.length() + snprintf(writelen_temp,
sizeof(writelen_temp), " crc32 %u\n", file->checksum_value);

Loading…
Cancel
Save