Support footer versions bigger than 1

Summary:
In this diff I add another parameter to BlockBasedTableOptions that will let users specify block based table's format. This will greatly simplify block based table's format changes in the future.

The first format change that this will support is encoding the decompressed size in Zlib and BZip2 blocks. This diff is blocking https://reviews.facebook.net/D31311.

Test Plan: Added unit tests. More tests to come as part of https://reviews.facebook.net/D31311.

Reviewers: dhruba, MarkCallaghan, yhchiang, rven, sdong

Reviewed By: sdong

Subscribers: dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D31383
main
Igor Canadi 10 years ago
parent 53f615df6a
commit 96b8240bc5
  1. 13
      include/rocksdb/table.h
  2. 25
      table/block_based_table_builder.cc
  3. 6
      table/block_based_table_factory.cc
  4. 10
      table/block_based_table_reader.cc
  5. 2
      table/cuckoo_table_builder.cc
  6. 91
      table/format.cc
  7. 47
      table/format.h
  8. 12
      table/meta_blocks.cc
  9. 2
      table/plain_table_builder.cc
  10. 31
      table/table_test.cc

@ -125,6 +125,19 @@ struct BlockBasedTableOptions {
// If true, place whole keys in the filter (not just prefixes).
// This must generally be true for gets to be efficient.
bool whole_key_filtering = true;
// For more details on BlockBasedTable's formats, see FORMAT-CHANGES.md
// We currently have two versions:
// 0 -- This version is currently written out by all RocksDB's versions by
// default. Can be read by really old RocksDB's. Doesn't support changing
// checksum (default is CRC32).
// 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
// checksum, like xxHash. It is written by RocksDB when
// BlockBasedTableOptions::checksum is something other than kCRC32c. (version
// 0 is silently upconverted)
// This only affects newly written tables. When reading existing tables, the
// information about version is read from the footer.
uint32_t format_version = 0;
};
// Table Properties that are specific to block-based table properties.

@ -472,9 +472,20 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator, WritableFile* file,
const CompressionType compression_type,
const CompressionOptions& compression_opts)
: rep_(new Rep(ioptions, table_options, internal_comparator,
file, compression_type, compression_opts)) {
const CompressionOptions& compression_opts) {
BlockBasedTableOptions sanitized_table_options(table_options);
if (sanitized_table_options.format_version == 0 &&
sanitized_table_options.checksum != kCRC32c) {
Log(InfoLogLevel::WARN_LEVEL, ioptions.info_log,
"Silently converting format_version to 1 because checksum is "
"non-default");
// silently convert format_version to 1 to keep consistent with current
// behavior
sanitized_table_options.format_version = 1;
}
rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator, file,
compression_type, compression_opts);
if (rep_->filter_block != nullptr) {
rep_->filter_block->StartBlock(0);
}
@ -771,9 +782,13 @@ Status BlockBasedTableBuilder::Finish() {
// TODO(icanadi) at some point in the future, when we're absolutely sure
// nobody will roll back to RocksDB 2.x versions, retire the legacy magic
// number and always write new table files with new magic number
bool legacy = (r->table_options.checksum == kCRC32c);
bool legacy = (r->table_options.format_version == 0);
// this is guaranteed by BlockBasedTableBuilder's constructor
assert(r->table_options.checksum == kCRC32c ||
r->table_options.format_version != 0);
Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber
: kBlockBasedTableMagicNumber);
: kBlockBasedTableMagicNumber,
r->table_options.format_version);
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(index_block_handle);
footer.set_checksum(r->table_options.checksum);

@ -76,6 +76,10 @@ Status BlockBasedTableFactory::SanitizeOptions(
return Status::InvalidArgument("Enable cache_index_and_filter_blocks, "
", but block cache is disabled");
}
if (table_options_.format_version > 1) {
return Status::InvalidArgument(
"We currently only support versions 0 and 1");
}
return Status::OK();
}
@ -135,6 +139,8 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
ret.append(buffer);
snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n",
table_options_.whole_key_filtering);
snprintf(buffer, kBufferSize, " format_version: %d\n",
table_options_.format_version);
ret.append(buffer);
return ret;
}

@ -436,11 +436,17 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
unique_ptr<TableReader>* table_reader) {
table_reader->reset();
Footer footer(kBlockBasedTableMagicNumber);
auto s = ReadFooterFromFile(file.get(), file_size, &footer);
Footer footer;
auto s = ReadFooterFromFile(file.get(), file_size, &footer,
kBlockBasedTableMagicNumber);
if (!s.ok()) {
return s;
}
if (footer.version() > 1) {
return Status::Corruption(
"Unknown Footer version. Maybe this file was created with too new "
"version of RocksDB?");
}
// We've successfully read the footer and the index block: we're
// ready to serve requests.

@ -377,7 +377,7 @@ Status CuckooTableBuilder::Finish() {
return s;
}
Footer footer(kCuckooTableMagicNumber);
Footer footer(kCuckooTableMagicNumber, 1);
footer.set_metaindex_handle(meta_index_block_handle);
footer.set_index_handle(BlockHandle::NullBlockHandle());
std::string footer_encoding;

@ -72,6 +72,23 @@ std::string BlockHandle::ToString(bool hex) const {
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
namespace {

// Returns true when magic_number equals one of the two legacy magic numbers
// (legacy block-based table or legacy plain table).
inline bool IsLegacyFooterFormat(uint64_t magic_number) {
  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
    return true;
  }
  return magic_number == kLegacyPlainTableMagicNumber;
}

// Maps a legacy magic number to its non-legacy counterpart.
// Any other input trips the assert; when the assert is compiled out the
// function falls through and returns 0.
inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
  uint64_t upconverted = 0;
  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
    upconverted = kBlockBasedTableMagicNumber;
  } else if (magic_number == kLegacyPlainTableMagicNumber) {
    upconverted = kPlainTableMagicNumber;
  } else {
    assert(false);
  }
  return upconverted;
}

}  // namespace
// legacy footer format:
// metaindex handle (varint64 offset, varint64 size)
// index handle (varint64 offset, varint64 size)
@ -85,7 +102,8 @@ const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
// footer version (4 bytes)
// table_magic_number (8 bytes)
void Footer::EncodeTo(std::string* dst) const {
if (version() == kLegacyFooter) {
assert(HasInitializedTableMagicNumber());
if (IsLegacyFooterFormat(table_magic_number())) {
// has to be default checksum with legacy footer
assert(checksum_ == kCRC32c);
const size_t original_size = dst->size();
@ -100,39 +118,24 @@ void Footer::EncodeTo(std::string* dst) const {
dst->push_back(static_cast<char>(checksum_));
metaindex_handle_.EncodeTo(dst);
index_handle_.EncodeTo(dst);
dst->resize(original_size + kVersion1EncodedLength - 12); // Padding
PutFixed32(dst, kFooterVersion);
dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
PutFixed32(dst, version());
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
assert(dst->size() == original_size + kVersion1EncodedLength);
assert(dst->size() == original_size + kNewVersionsEncodedLength);
}
}
namespace {
inline bool IsLegacyFooterFormat(uint64_t magic_number) {
return magic_number == kLegacyBlockBasedTableMagicNumber ||
magic_number == kLegacyPlainTableMagicNumber;
}
inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
if (magic_number == kLegacyBlockBasedTableMagicNumber) {
return kBlockBasedTableMagicNumber;
}
if (magic_number == kLegacyPlainTableMagicNumber) {
return kPlainTableMagicNumber;
}
assert(false);
return 0;
}
} // namespace
Footer::Footer(uint64_t _table_magic_number)
: version_(IsLegacyFooterFormat(_table_magic_number) ? kLegacyFooter
: kFooterVersion),
Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
: version_(_version),
checksum_(kCRC32c),
table_magic_number_(_table_magic_number) {}
table_magic_number_(_table_magic_number) {
// This should be guaranteed by constructor callers
assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
}
Status Footer::DecodeFrom(Slice* input) {
assert(!HasInitializedTableMagicNumber());
assert(input != nullptr);
assert(input->size() >= kMinEncodedLength);
@ -148,36 +151,23 @@ Status Footer::DecodeFrom(Slice* input) {
if (legacy) {
magic = UpconvertLegacyFooterFormat(magic);
}
if (HasInitializedTableMagicNumber()) {
if (magic != table_magic_number()) {
char buffer[80];
snprintf(buffer, sizeof(buffer) - 1,
"not an sstable (bad magic number --- %lx)",
(long)magic);
return Status::Corruption(buffer);
}
} else {
set_table_magic_number(magic);
}
if (legacy) {
// The size is already asserted to be at least kMinEncodedLength
// at the beginning of the function
input->remove_prefix(input->size() - kVersion0EncodedLength);
version_ = kLegacyFooter;
version_ = 0 /* legacy */;
checksum_ = kCRC32c;
} else {
version_ = DecodeFixed32(magic_ptr - 4);
if (version_ != kFooterVersion) {
return Status::Corruption("bad footer version");
}
// Footer version 1 will always occupy exactly this many bytes.
// Footer version 1 and higher will always occupy exactly this many bytes.
// It consists of the checksum type, two block handles, padding,
// a version number, and a magic number
if (input->size() < kVersion1EncodedLength) {
if (input->size() < kNewVersionsEncodedLength) {
return Status::Corruption("input is too short to be an sstable");
} else {
input->remove_prefix(input->size() - kVersion1EncodedLength);
input->remove_prefix(input->size() - kNewVersionsEncodedLength);
}
uint32_t chksum;
if (!GetVarint32(input, &chksum)) {
@ -219,9 +209,8 @@ std::string Footer::ToString() const {
return result;
}
Status ReadFooterFromFile(RandomAccessFile* file,
uint64_t file_size,
Footer* footer) {
Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size,
Footer* footer, uint64_t enforce_table_magic_number) {
if (file_size < Footer::kMinEncodedLength) {
return Status::Corruption("file is too short to be an sstable");
}
@ -242,7 +231,15 @@ Status ReadFooterFromFile(RandomAccessFile* file,
return Status::Corruption("file is too short to be an sstable");
}
return footer->DecodeFrom(&footer_input);
s = footer->DecodeFrom(&footer_input);
if (!s.ok()) {
return s;
}
if (enforce_table_magic_number != 0 &&
enforce_table_magic_number != footer->table_magic_number()) {
return Status::Corruption("Bad table magic number");
}
return Status::OK();
}
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes

@ -72,12 +72,13 @@ class Footer {
// Constructs a footer without specifying its table magic number.
// In such case, the table magic number of such footer should be
// initialized via @ReadFooterFromFile().
Footer() : Footer(kInvalidTableMagicNumber) {}
// Use this when you plan to load Footer with DecodeFrom(). Never use this
// when you plan to EncodeTo.
Footer() : Footer(kInvalidTableMagicNumber, 0) {}
// @table_magic_number serves two purposes:
// 1. Identify different types of the tables.
// 2. Help us to identify if a given file is a valid sst.
explicit Footer(uint64_t table_magic_number);
// Use this constructor when you plan to write out the footer using
// EncodeTo(). Never use this constructor with DecodeFrom().
Footer(uint64_t table_magic_number, uint32_t version);
// The version of the footer in this file
uint32_t version() const { return version_; }
@ -97,20 +98,13 @@ class Footer {
uint64_t table_magic_number() const { return table_magic_number_; }
// The version of Footer we encode
enum {
kLegacyFooter = 0,
kFooterVersion = 1,
};
void EncodeTo(std::string* dst) const;
// Set the current footer based on the input slice. If table_magic_number_
// is not set (i.e., HasInitializedTableMagicNumber() is true), then this
// function will also initialize table_magic_number_. Otherwise, this
// function will verify whether the magic number specified in the input
// slice matches table_magic_number_ and update the current footer only
// when the test passes.
// Set the current footer based on the input slice.
//
// REQUIRES: table_magic_number_ is not set (i.e.,
// HasInitializedTableMagicNumber() is false). The function will initialize the
// magic number.
Status DecodeFrom(Slice* input);
// Encoded length of a Footer. Note that the serialization of a Footer will
@ -121,13 +115,12 @@ class Footer {
// Footer version 0 (legacy) will always occupy exactly this many bytes.
// It consists of two block handles, padding, and a magic number.
kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
// Footer version 1 will always occupy exactly this many bytes.
// It consists of the checksum type, two block handles, padding,
// a version number, and a magic number
kVersion1EncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
// Footer of versions 1 and higher will always occupy exactly this many
// bytes. It consists of the checksum type, two block handles, padding,
// a version number (1 or higher), and a magic number
kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
kMinEncodedLength = kVersion0EncodedLength,
kMaxEncodedLength = kVersion1EncodedLength
kMaxEncodedLength = kNewVersionsEncodedLength,
};
static const uint64_t kInvalidTableMagicNumber = 0;
@ -156,9 +149,11 @@ class Footer {
};
// Read the footer from file
Status ReadFooterFromFile(RandomAccessFile* file,
uint64_t file_size,
Footer* footer);
// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
// corruption if table_magic number is not equal to enforce_table_magic_number
Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size,
Footer* footer,
uint64_t enforce_table_magic_number = 0);
// 1-byte type + 32-bit crc
static const size_t kBlockTrailerSize = 5;

@ -220,8 +220,8 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
Logger* info_log, TableProperties** properties) {
// -- Read metaindex block
Footer footer(table_magic_number);
auto s = ReadFooterFromFile(file, file_size, &footer);
Footer footer;
auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number);
if (!s.ok()) {
return s;
}
@ -274,8 +274,8 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
uint64_t table_magic_number, Env* env,
const std::string& meta_block_name,
BlockHandle* block_handle) {
Footer footer(table_magic_number);
auto s = ReadFooterFromFile(file, file_size, &footer);
Footer footer;
auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number);
if (!s.ok()) {
return s;
}
@ -302,8 +302,8 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
const std::string& meta_block_name,
BlockContents* contents) {
Status status;
Footer footer(table_magic_number);
status = ReadFooterFromFile(file, file_size, &footer);
Footer footer;
status = ReadFooterFromFile(file, file_size, &footer, table_magic_number);
if (!status.ok()) {
return status;
}

@ -258,7 +258,7 @@ Status PlainTableBuilder::Finish() {
// Write Footer
// no need to write out new footer if we're using default checksum
Footer footer(kLegacyPlainTableMagicNumber);
Footer footer(kLegacyPlainTableMagicNumber, 0);
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(BlockHandle::NullBlockHandle());
std::string footer_encoding;

@ -1943,7 +1943,7 @@ TEST(Harness, FooterTests) {
{
// upconvert legacy block based
std::string encoded;
Footer footer(kLegacyBlockBasedTableMagicNumber);
Footer footer(kLegacyBlockBasedTableMagicNumber, 0);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
@ -1957,11 +1957,12 @@ TEST(Harness, FooterTests) {
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 0U);
}
{
// xxhash block based
std::string encoded;
Footer footer(kBlockBasedTableMagicNumber);
Footer footer(kBlockBasedTableMagicNumber, 1);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
@ -1976,11 +1977,12 @@ TEST(Harness, FooterTests) {
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 1U);
}
{
// upconvert legacy plain table
std::string encoded;
Footer footer(kLegacyPlainTableMagicNumber);
Footer footer(kLegacyPlainTableMagicNumber, 0);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
@ -1994,11 +1996,12 @@ TEST(Harness, FooterTests) {
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 0U);
}
{
// xxhash plain table
std::string encoded;
Footer footer(kPlainTableMagicNumber);
Footer footer(kPlainTableMagicNumber, 1);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
@ -2013,6 +2016,26 @@ TEST(Harness, FooterTests) {
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 1U);
}
{
// version == 2
std::string encoded;
Footer footer(kBlockBasedTableMagicNumber, 2);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
footer.EncodeTo(&encoded);
Footer decoded_footer;
Slice encoded_slice(encoded);
decoded_footer.DecodeFrom(&encoded_slice);
ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 2U);
}
}

Loading…
Cancel
Save