More refactoring ahead of footer & meta changes (#9240)

Summary:
I'm working on a new format_version=6 to support context
checksum (https://github.com/facebook/rocksdb/issues/9058) and this includes much of the refactoring and test
updates to support that change.

Test coverage data and manual inspection agree on dead code in
block_based_table_reader.cc (removed).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9240

Test Plan:
tests enhanced to cover more cases etc.

Extreme case performance testing indicates a small % regression in fillseq (w/ compaction), though the CPU profile etc. doesn't suggest any explanation. There is enhanced correctness checking in Footer::DecodeFrom, but this should be negligible.

TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=1 --disable_wal={false,true}

(Each is ops/s averaged over 50 runs, run simultaneously with competing configuration for load fairness)
Before w/ wal: 454512
After w/ wal: 444820 (-2.1%)
Before w/o wal: 1004560
After w/o wal: 998897 (-0.6%)

Since this doesn't modify WAL code, one would expect real effects to be larger in w/o wal case.

This regression will be corrected in a follow-up PR.

Reviewed By: ajkr

Differential Revision: D32813769

Pulled By: pdillinger

fbshipit-source-id: 444a244eabf3825cd329b7d1b150cddce320862f
main
Peter Dillinger 3 years ago committed by Facebook GitHub Bot
parent f57745814f
commit 653c392e47
  1. 6
      db/column_family_test.cc
  2. 2
      db/comparator_db_test.cc
  3. 2
      db/corruption_test.cc
  4. 10
      db/db_basic_test.cc
  5. 1
      db/db_block_cache_test.cc
  6. 7
      db/db_bloom_filter_test.cc
  7. 6
      db/db_test_util.cc
  8. 1
      db/db_test_util.h
  9. 60
      db/external_sst_file_test.cc
  10. 6
      db/version_set.cc
  11. 1
      include/rocksdb/stats_history.h
  12. 10
      include/rocksdb/table.h
  13. 2
      table/adaptive/adaptive_table_factory.cc
  14. 22
      table/block_based/block_based_table_builder.cc
  15. 2
      table/block_based/block_based_table_factory.cc
  16. 53
      table/block_based/block_based_table_reader.cc
  17. 1
      table/block_based/block_based_table_reader.h
  18. 17
      table/block_based/partitioned_filter_block_test.cc
  19. 8
      table/block_fetcher.cc
  20. 1
      table/block_fetcher.h
  21. 13
      table/cuckoo/cuckoo_table_builder.cc
  22. 193
      table/format.cc
  23. 167
      table/format.h
  24. 16
      table/meta_blocks.cc
  25. 6
      table/meta_blocks.h
  26. 12
      table/plain/plain_table_builder.cc
  27. 2
      table/sst_file_dumper.cc
  28. 144
      table/table_test.cc
  29. 7
      test_util/testutil.cc
  30. 2
      test_util/testutil.h
  31. 19
      util/cast_util.h
  32. 2
      util/coding.h

@ -554,7 +554,7 @@ class ColumnFamilyTest
INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
testing::Values(test::kDefaultFormatVersion));
INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
testing::Values(test::kLatestFormatVersion));
testing::Values(kLatestFormatVersion));
TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
for (int iter = 0; iter < 3; ++iter) {
@ -746,8 +746,8 @@ INSTANTIATE_TEST_CASE_P(
std::make_tuple(test::kDefaultFormatVersion, false)));
INSTANTIATE_TEST_CASE_P(
FormatLatest, FlushEmptyCFTestWithParam,
testing::Values(std::make_tuple(test::kLatestFormatVersion, true),
std::make_tuple(test::kLatestFormatVersion, false)));
testing::Values(std::make_tuple(kLatestFormatVersion, true),
std::make_tuple(kLatestFormatVersion, false)));
TEST_P(ColumnFamilyTest, AddDrop) {
Open();

@ -317,7 +317,7 @@ class ComparatorDBTest
INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
testing::Values(test::kDefaultFormatVersion));
INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
testing::Values(test::kLatestFormatVersion));
testing::Values(kLatestFormatVersion));
TEST_P(ComparatorDBTest, Bytewise) {
for (int rand_seed = 301; rand_seed < 306; rand_seed++) {

@ -546,7 +546,7 @@ TEST_F(CorruptionTest, RangeDeletionCorrupted) {
BlockHandle range_del_handle;
ASSERT_OK(FindMetaBlockInFile(
file_reader.get(), file_size, kBlockBasedTableMagicNumber,
ImmutableOptions(options_), kRangeDelBlock, &range_del_handle));
ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle));
ASSERT_OK(TryReopen());
ASSERT_OK(test::CorruptFile(env_, filename,

@ -15,6 +15,7 @@
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/table.h"
#include "rocksdb/utilities/debug.h"
#include "table/block_based/block_based_table_reader.h"
#include "table/block_based/block_builder.h"
@ -972,8 +973,15 @@ TEST_F(DBBasicTest, MultiGetEmpty) {
} while (ChangeCompactOptions());
}
TEST_F(DBBasicTest, ChecksumTest) {
class DBBlockChecksumTest : public DBBasicTest,
public testing::WithParamInterface<uint32_t> {};
INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest,
testing::ValuesIn(test::kFooterFormatVersionsToTest));
TEST_P(DBBlockChecksumTest, BlockChecksumTest) {
BlockBasedTableOptions table_options;
table_options.format_version = GetParam();
Options options = CurrentOptions();
const int kNumPerFile = 2;

@ -15,6 +15,7 @@
#include "db/column_family.h"
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "util/compression.h"

@ -551,10 +551,9 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
FormatLatest, DBBloomFilterTestWithParam,
::testing::Values(
std::make_tuple(BFP::kDeprecatedBlock, false,
test::kLatestFormatVersion),
std::make_tuple(BFP::kAutoBloom, true, test::kLatestFormatVersion),
std::make_tuple(BFP::kAutoBloom, false, test::kLatestFormatVersion)));
std::make_tuple(BFP::kDeprecatedBlock, false, kLatestFormatVersion),
std::make_tuple(BFP::kAutoBloom, true, kLatestFormatVersion),
std::make_tuple(BFP::kAutoBloom, false, kLatestFormatVersion)));
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
TEST_F(DBBloomFilterTest, BloomFilterRate) {

@ -15,6 +15,7 @@
#include "rocksdb/env_encryption.h"
#include "rocksdb/unique_id.h"
#include "rocksdb/utilities/object_registry.h"
#include "table/format.h"
#include "util/random.h"
namespace ROCKSDB_NAMESPACE {
@ -516,6 +517,11 @@ Options DBTestBase::GetOptions(
table_options.index_block_restart_interval = 8;
break;
}
case kBlockBasedTableWithLatestFormat: {
// In case different from default
table_options.format_version = kLatestFormatVersion;
break;
}
case kOptimizeFiltersForHits: {
options.optimize_filters_for_hits = true;
set_block_based_table_factory = true;

@ -867,6 +867,7 @@ class DBTestBase : public testing::Test {
kBlockBasedTableWithIndexRestartInterval,
kBlockBasedTableWithPartitionedIndex,
kBlockBasedTableWithPartitionedIndexFormat4,
kBlockBasedTableWithLatestFormat,
kPartitionedFilterWithNewTableReaderForCompactions,
kUniversalSubcompactions,
kUnorderedWrite,

@ -41,16 +41,33 @@ class ExternalSSTTestEnv : public EnvWrapper {
bool fail_link_;
};
// Common fixture base for the external SST file tests in this file. It keeps
// a scratch directory path (sst_files_dir_) built from dbname_, recreates that
// directory when the fixture is constructed, and removes it again when the
// fixture is destroyed.
class ExternalSSTFileTestBase : public DBTestBase {
public:
// Sets up the DBTestBase with the "external_sst_file_test" name and
// env_do_fsync enabled, then prepares an empty sst_files_dir_.
ExternalSSTFileTestBase()
: DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) {
// dbname_ is not declared in this class; presumably provided by DBTestBase.
sst_files_dir_ = dbname_ + "/sst_files/";
DestroyAndRecreateExternalSSTFilesDir();
}
// Removes sst_files_dir_ via DestroyDir and then creates it again via
// env_->CreateDir. Both results are checked with ASSERT_OK.
void DestroyAndRecreateExternalSSTFilesDir() {
ASSERT_OK(DestroyDir(env_, sst_files_dir_));
ASSERT_OK(env_->CreateDir(sst_files_dir_));
}
// Cleans up the scratch directory; the returned status is explicitly marked
// as permitted to go unchecked.
~ExternalSSTFileTestBase() override {
DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
}
protected:
// Directory path used for externally generated SST files
// (dbname_ + "/sst_files/").
std::string sst_files_dir_;
};
class ExternSSTFileLinkFailFallbackTest
: public DBTestBase,
: public ExternalSSTFileTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public:
ExternSSTFileLinkFailFallbackTest()
: DBTestBase("external_sst_file_test", /*env_do_fsync=*/true),
test_env_(new ExternalSSTTestEnv(env_, true)) {
sst_files_dir_ = dbname_ + "/sst_files/";
EXPECT_EQ(DestroyDir(env_, sst_files_dir_), Status::OK());
EXPECT_EQ(env_->CreateDir(sst_files_dir_), Status::OK());
: test_env_(new ExternalSSTTestEnv(env_, true)) {
options_ = CurrentOptions();
options_.disable_auto_compactions = true;
options_.env = test_env_;
@ -65,25 +82,15 @@ class ExternSSTFileLinkFailFallbackTest
}
protected:
std::string sst_files_dir_;
Options options_;
ExternalSSTTestEnv* test_env_;
};
class ExternalSSTFileTest
: public DBTestBase,
: public ExternalSSTFileTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public:
ExternalSSTFileTest()
: DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) {
sst_files_dir_ = dbname_ + "/sst_files/";
DestroyAndRecreateExternalSSTFilesDir();
}
void DestroyAndRecreateExternalSSTFilesDir() {
ASSERT_OK(DestroyDir(env_, sst_files_dir_));
ASSERT_OK(env_->CreateDir(sst_files_dir_));
}
ExternalSSTFileTest() {}
Status GenerateOneExternalFile(
const Options& options, ColumnFamilyHandle* cfh,
@ -282,13 +289,8 @@ class ExternalSSTFileTest
return db_->IngestExternalFile(files, opts);
}
~ExternalSSTFileTest() override {
DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
}
protected:
int last_file_id_ = 0;
std::string sst_files_dir_;
};
TEST_F(ExternalSSTFileTest, Basic) {
@ -2382,10 +2384,18 @@ TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
ASSERT_EQ(1, num_compression_dicts);
}
class ExternalSSTBlockChecksumTest
: public ExternalSSTFileTestBase,
public testing::WithParamInterface<uint32_t> {};
INSTANTIATE_TEST_CASE_P(FormatVersions, ExternalSSTBlockChecksumTest,
testing::ValuesIn(test::kFooterFormatVersionsToTest));
// Very slow, not worth the cost to run regularly
TEST_F(ExternalSSTFileTest, DISABLED_HugeBlockChecksum) {
TEST_P(ExternalSSTBlockChecksumTest, DISABLED_HugeBlockChecksum) {
BlockBasedTableOptions table_options;
table_options.format_version = GetParam();
for (auto t : GetSupportedChecksums()) {
BlockBasedTableOptions table_options;
table_options.checksum = t;
Options options = CurrentOptions();
options.table_factory.reset(NewBlockBasedTableFactory(table_options));

@ -1271,8 +1271,8 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
return s;
}
// By setting the magic number to kInvalidTableMagicNumber, we can by
// pass the magic number check in the footer.
// By setting the magic number to kNullTableMagicNumber, we can bypass
// the magic number check in the footer.
std::unique_ptr<RandomAccessFileReader> file_reader(
new RandomAccessFileReader(
std::move(file), file_name, nullptr /* env */, io_tracer_,
@ -1281,7 +1281,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
std::unique_ptr<TableProperties> props;
s = ReadTableProperties(
file_reader.get(), file_meta->fd.GetFileSize(),
Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions,
Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
&props);
if (!s.ok()) {
return s;

@ -53,6 +53,7 @@ class StatsHistoryIterator {
// REQUIRES: Valid()
virtual uint64_t GetStatsTime() const = 0;
// DEPRECATED (was never used)
virtual int GetFormatVersion() const { return -1; }
// Return the current stats history as an std::map which specifies the

@ -44,6 +44,9 @@ class WritableFileWriter;
struct ConfigOptions;
struct EnvOptions;
// Types of checksums to use for checking integrity of logical blocks within
// files. All checksums currently use 32 bits of checking power (1 in 4B
// chance of failing to detect random corruption).
enum ChecksumType : char {
kNoChecksum = 0x0,
kCRC32c = 0x1,
@ -390,10 +393,9 @@ struct BlockBasedTableOptions {
// Default: 0 (disabled)
uint32_t read_amp_bytes_per_bit = 0;
// We currently have five versions:
// 0 -- This version is currently written out by all RocksDB's versions by
// default. Can be read by really old RocksDB's. Doesn't support changing
// checksum (default is CRC32).
// We currently have these versions:
// 0 -- This version can be read by really old RocksDB's. Doesn't support
// changing checksum type (default is CRC32).
// 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
// checksum, like xxHash. It is written by RocksDB when
// BlockBasedTableOptions::checksum is something other than kCRC32c. (version

@ -58,7 +58,7 @@ Status AdaptiveTableFactory::NewTableReader(
return plain_table_factory_->NewTableReader(
table_reader_options, std::move(file), file_size, table);
} else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
return block_based_table_factory_->NewTableReader(
ro, table_reader_options, std::move(file), file_size, table,
prefetch_index_and_filter_in_cache);

@ -1744,7 +1744,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
}
#endif // !NDEBUG
const std::string* properties_block_meta = &kPropertiesBlock;
const std::string* properties_block_meta = &kPropertiesBlockName;
TEST_SYNC_POINT_CALLBACK(
"BlockBasedTableBuilder::WritePropertiesBlock:Meta",
&properties_block_meta);
@ -1769,7 +1769,7 @@ void BlockBasedTableBuilder::WriteCompressionDictBlock(
#endif // NDEBUG
}
if (ok()) {
meta_index_builder->Add(kCompressionDictBlock,
meta_index_builder->Add(kCompressionDictBlockName,
compression_dict_block_handle);
}
}
@ -1781,7 +1781,7 @@ void BlockBasedTableBuilder::WriteRangeDelBlock(
BlockHandle range_del_block_handle;
WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression,
&range_del_block_handle, BlockType::kRangeDeletion);
meta_index_builder->Add(kRangeDelBlock, range_del_block_handle);
meta_index_builder->Add(kRangeDelBlockName, range_del_block_handle);
}
}
@ -1799,14 +1799,16 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
// this is guaranteed by BlockBasedTableBuilder's constructor
assert(r->table_options.checksum == kCRC32c ||
r->table_options.format_version != 0);
Footer footer(
legacy ? kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber,
r->table_options.format_version);
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(index_block_handle);
footer.set_checksum(r->table_options.checksum);
Footer footer;
footer
.set_table_magic_number(legacy ? kLegacyBlockBasedTableMagicNumber
: kBlockBasedTableMagicNumber)
.set_format_version(r->table_options.format_version)
.set_metaindex_handle(metaindex_block_handle)
.set_index_handle(index_block_handle)
.set_checksum_type(r->table_options.checksum);
std::string footer_encoding;
footer.EncodeTo(&footer_encoding);
footer.EncodeTo(&footer_encoding, r->get_offset());
assert(ok());
IOStatus ios = r->file->Append(footer_encoding);
if (ios.ok()) {

@ -650,7 +650,7 @@ Status BlockBasedTableFactory::ValidateOptions(
"Enable pin_l0_filter_and_index_blocks_in_cache, "
", but block cache is disabled");
}
if (!BlockBasedTableSupportedVersion(table_options_.format_version)) {
if (!IsSupportedFormatVersion(table_options_.format_version)) {
return Status::InvalidArgument(
"Unsupported BlockBasedTable format_version. Please check "
"include/rocksdb/table.h for more info");

@ -600,7 +600,7 @@ Status BlockBasedTable::Open(
if (!s.ok()) {
return s;
}
if (!BlockBasedTableSupportedVersion(footer.version())) {
if (!IsSupportedFormatVersion(footer.format_version())) {
return Status::Corruption(
"Unknown Footer version. Maybe this file was created with newer "
"version of RocksDB?");
@ -757,7 +757,7 @@ Status BlockBasedTable::ReadPropertiesBlock(
InternalIterator* meta_iter, const SequenceNumber largest_seqno) {
Status s;
BlockHandle handle;
s = FindOptionalMetaBlock(meta_iter, kPropertiesBlock, &handle);
s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle);
if (!s.ok()) {
ROCKS_LOG_WARN(rep_->ioptions.logger,
@ -856,7 +856,7 @@ Status BlockBasedTable::ReadRangeDelBlock(
BlockCacheLookupContext* lookup_context) {
Status s;
BlockHandle range_del_handle;
s = FindOptionalMetaBlock(meta_iter, kRangeDelBlock, &range_del_handle);
s = FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle);
if (!s.ok()) {
ROCKS_LOG_WARN(
rep_->ioptions.logger,
@ -925,7 +925,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
// Find compression dictionary handle
s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlock,
s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName,
&rep_->compression_dict_handle);
if (!s.ok()) {
return s;
@ -1808,7 +1808,7 @@ void BlockBasedTable::RetrieveMultipleBlocks(
// begin address of each read request, we need to add the offset
// in each read request. Checksum is stored in the block trailer,
// beyond the payload size.
s = VerifyBlockChecksum(footer.checksum(), data + req_offset,
s = VerifyBlockChecksum(footer.checksum_type(), data + req_offset,
handle.size(), rep_->file->file_name(),
handle.offset());
TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
@ -1875,9 +1875,9 @@ void BlockBasedTable::RetrieveMultipleBlocks(
if (compression_type != kNoCompression) {
UncompressionContext context(compression_type);
UncompressionInfo info(context, uncompression_dict, compression_type);
s = UncompressBlockContents(info, req.result.data() + req_offset,
handle.size(), &contents, footer.version(),
rep_->ioptions, memory_allocator);
s = UncompressBlockContents(
info, req.result.data() + req_offset, handle.size(), &contents,
footer.format_version(), rep_->ioptions, memory_allocator);
} else {
// There are two cases here:
// 1) caller uses the shared buffer (scratch or direct io buffer);
@ -3008,15 +3008,15 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
return BlockType::kFilter;
}
if (meta_block_name == kPropertiesBlock) {
if (meta_block_name == kPropertiesBlockName) {
return BlockType::kProperties;
}
if (meta_block_name == kCompressionDictBlock) {
if (meta_block_name == kCompressionDictBlockName) {
return BlockType::kCompressionDictionary;
}
if (meta_block_name == kRangeDelBlock) {
if (meta_block_name == kRangeDelBlockName) {
return BlockType::kRangeDeletion;
}
@ -3045,7 +3045,7 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks(
s = handle.DecodeFrom(&input);
BlockContents contents;
const Slice meta_block_name = index_iter->key();
if (meta_block_name == kPropertiesBlock) {
if (meta_block_name == kPropertiesBlockName) {
// Unfortunate special handling for properties block checksum w/
// global seqno
std::unique_ptr<TableProperties> table_properties;
@ -3111,8 +3111,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
// 5. index_type
Status BlockBasedTable::CreateIndexReader(
const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
bool pin, BlockCacheLookupContext* lookup_context,
InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin,
BlockCacheLookupContext* lookup_context,
std::unique_ptr<IndexReader>* index_reader) {
// kHashSearch requires non-empty prefix_extractor but bypass checking
// prefix_extractor here since we have no access to MutableCFOptions.
@ -3136,25 +3136,12 @@ Status BlockBasedTable::CreateIndexReader(
case BlockBasedTableOptions::kHashSearch: {
std::unique_ptr<Block> metaindex_guard;
std::unique_ptr<InternalIterator> metaindex_iter_guard;
auto meta_index_iter = preloaded_meta_index_iter;
bool should_fallback = false;
if (rep_->internal_prefix_transform.get() == nullptr) {
ROCKS_LOG_WARN(rep_->ioptions.logger,
"No prefix extractor passed in. Fall back to binary"
" search index.");
should_fallback = true;
} else if (meta_index_iter == nullptr) {
auto s = ReadMetaIndexBlock(ro, prefetch_buffer, &metaindex_guard,
&metaindex_iter_guard);
if (!s.ok()) {
// we simply fall back to binary search in case there is any
// problem with prefix hash index loading.
ROCKS_LOG_WARN(rep_->ioptions.logger,
"Unable to read the metaindex block."
" Fall back to binary search index.");
should_fallback = true;
}
meta_index_iter = metaindex_iter_guard.get();
}
if (should_fallback) {
@ -3162,9 +3149,9 @@ Status BlockBasedTable::CreateIndexReader(
use_cache, prefetch, pin,
lookup_context, index_reader);
} else {
return HashIndexReader::Create(this, ro, prefetch_buffer,
meta_index_iter, use_cache, prefetch,
pin, lookup_context, index_reader);
return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter,
use_cache, prefetch, pin, lookup_context,
index_reader);
}
}
default: {
@ -3357,17 +3344,17 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
if (!s.ok()) {
return s;
}
if (metaindex_iter->key() == kPropertiesBlock) {
if (metaindex_iter->key() == kPropertiesBlockName) {
out_stream << " Properties block handle: "
<< metaindex_iter->value().ToString(true) << "\n";
} else if (metaindex_iter->key() == kCompressionDictBlock) {
} else if (metaindex_iter->key() == kCompressionDictBlockName) {
out_stream << " Compression dictionary block handle: "
<< metaindex_iter->value().ToString(true) << "\n";
} else if (strstr(metaindex_iter->key().ToString().c_str(),
"filter.rocksdb.") != nullptr) {
out_stream << " Filter block handle: "
<< metaindex_iter->value().ToString(true) << "\n";
} else if (metaindex_iter->key() == kRangeDelBlock) {
} else if (metaindex_iter->key() == kRangeDelBlockName) {
out_stream << " Range deletion block handle: "
<< metaindex_iter->value().ToString(true) << "\n";
}

@ -20,6 +20,7 @@
#include "table/block_based/filter_block.h"
#include "table/block_based/uncompression_dict_reader.h"
#include "table/format.h"
#include "table/persistent_cache_options.h"
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"

@ -3,15 +3,15 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "table/block_based/partitioned_filter_block.h"
#include <map>
#include "index_builder.h"
#include "rocksdb/filter_policy.h"
#include "table/block_based/block_based_table_reader.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_based/filter_policy_internal.h"
#include "index_builder.h"
#include "table/format.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/coding.h"
@ -292,10 +292,11 @@ class PartitionedFilterBlockTest
}
};
INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest,
testing::Values(test::kDefaultFormatVersion));
INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest,
testing::Values(test::kLatestFormatVersion));
// Format versions potentially interesting to partitioning
INSTANTIATE_TEST_CASE_P(FormatVersions, PartitionedFilterBlockTest,
testing::ValuesIn(std::set<uint32_t>{
2, 3, 4, test::kDefaultFormatVersion,
kLatestFormatVersion}));
TEST_P(PartitionedFilterBlockTest, EmptyBuilder) {
std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());

@ -32,9 +32,9 @@ inline void BlockFetcher::ProcessTrailerIfPresent() {
if (footer_.GetBlockTrailerSize() > 0) {
assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize);
if (read_options_.verify_checksums) {
io_status_ = status_to_io_status(
VerifyBlockChecksum(footer_.checksum(), slice_.data(), block_size_,
file_->file_name(), handle_.offset()));
io_status_ = status_to_io_status(VerifyBlockChecksum(
footer_.checksum_type(), slice_.data(), block_size_,
file_->file_name(), handle_.offset()));
}
compression_type_ =
BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_);
@ -315,7 +315,7 @@ IOStatus BlockFetcher::ReadBlockContents() {
UncompressionContext context(compression_type_);
UncompressionInfo info(context, uncompression_dict_, compression_type_);
io_status_ = status_to_io_status(UncompressBlockContents(
info, slice_.data(), block_size_, contents_, footer_.version(),
info, slice_.data(), block_size_, contents_, footer_.format_version(),
ioptions_, memory_allocator_));
#ifndef NDEBUG
num_heap_buf_memcpy_++;

@ -12,6 +12,7 @@
#include "table/block_based/block.h"
#include "table/block_based/block_type.h"
#include "table/format.h"
#include "table/persistent_cache_options.h"
namespace ROCKSDB_NAMESPACE {

@ -381,7 +381,7 @@ Status CuckooTableBuilder::Finish() {
return status_;
}
meta_index_builder.Add(kPropertiesBlock, property_block_handle);
meta_index_builder.Add(kPropertiesBlockName, property_block_handle);
Slice meta_index_block = meta_index_builder.Finish();
BlockHandle meta_index_block_handle;
@ -393,11 +393,14 @@ Status CuckooTableBuilder::Finish() {
return status_;
}
Footer footer(kCuckooTableMagicNumber, 1);
footer.set_metaindex_handle(meta_index_block_handle);
footer.set_index_handle(BlockHandle::NullBlockHandle());
Footer footer;
footer.set_table_magic_number(kCuckooTableMagicNumber)
.set_format_version(1)
.set_metaindex_handle(meta_index_block_handle)
.set_index_handle(BlockHandle::NullBlockHandle())
.set_checksum_type(kNoChecksum);
std::string footer_encoding;
footer.EncodeTo(&footer_encoding);
footer.EncodeTo(&footer_encoding, offset);
io_status_ = file_->Append(footer_encoding);
status_ = io_status_;
return status_;

@ -20,9 +20,11 @@
#include "options/options_helper.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_reader.h"
#include "table/persistent_cache_helper.h"
#include "util/cast_util.h"
#include "util/coding.h"
#include "util/compression.h"
#include "util/crc32c.h"
@ -58,6 +60,15 @@ void BlockHandle::EncodeTo(std::string* dst) const {
PutVarint64Varint64(dst, offset_, size_);
}
// Encodes this handle into the raw buffer starting at dst: offset_ first,
// then size_, each through EncodeVarint64. Returns the pointer produced by
// the second EncodeVarint64 call (presumably one past the last byte written
// -- confirm against the EncodeVarint64 declaration).
// NOTE(review): no bounds check is visible here; the caller presumably has to
// provide room for two encoded varint64 values -- confirm.
char* BlockHandle::EncodeTo(char* dst) const {
// Sanity check that all fields have been set
// (all-ones appears to be the "not set" marker for both fields -- TODO
// confirm against the BlockHandle declaration)
assert(offset_ != ~uint64_t{0});
assert(size_ != ~uint64_t{0});
// The second call continues from where the first one stopped.
char* cur = EncodeVarint64(dst, offset_);
cur = EncodeVarint64(cur, size_);
return cur;
}
Status BlockHandle::DecodeFrom(Slice* input) {
if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
return Status::OK();
@ -166,8 +177,8 @@ inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
}
} // namespace
void Footer::set_table_magic_number(uint64_t magic_number) {
assert(!HasInitializedTableMagicNumber());
Footer& Footer::set_table_magic_number(uint64_t magic_number) {
assert(table_magic_number_ == kNullTableMagicNumber);
table_magic_number_ = magic_number;
if (magic_number == kBlockBasedTableMagicNumber ||
magic_number == kLegacyBlockBasedTableMagicNumber) {
@ -176,64 +187,80 @@ void Footer::set_table_magic_number(uint64_t magic_number) {
} else {
block_trailer_size_ = 0;
}
return *this;
}
// legacy footer format:
// metaindex handle (varint64 offset, varint64 size)
// index handle (varint64 offset, varint64 size)
// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
// table_magic_number (8 bytes)
// new footer format:
// checksum type (char, 1 byte)
// metaindex handle (varint64 offset, varint64 size)
// index handle (varint64 offset, varint64 size)
// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
// footer version (4 bytes)
// table_magic_number (8 bytes)
void Footer::EncodeTo(std::string* dst) const {
assert(HasInitializedTableMagicNumber());
if (IsLegacyFooterFormat(table_magic_number())) {
// has to be default checksum with legacy footer
assert(checksum_ == kCRC32c);
const size_t original_size = dst->size();
metaindex_handle_.EncodeTo(dst);
index_handle_.EncodeTo(dst);
dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
assert(dst->size() == original_size + kVersion0EncodedLength);
// Footer format, in three parts:
// * Part1
// -> format_version == 0 (inferred from legacy magic number)
// <empty> (0 bytes)
// -> format_version >= 1
// checksum type (char, 1 byte)
// * Part2
// metaindex handle (varint64 offset, varint64 size)
// index handle (varint64 offset, varint64 size)
// <zero padding> for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40
// * Part3
// -> format_version == 0 (inferred from legacy magic number)
// legacy magic number (8 bytes)
// -> format_version >= 1 (inferred from NOT legacy magic number)
// format_version (uint32LE, 4 bytes), also called "footer version"
// newer magic number (8 bytes)
void Footer::EncodeTo(std::string* dst, uint64_t footer_offset) const {
(void)footer_offset; // Future use
// Sanitize magic numbers & format versions
assert(table_magic_number_ != kNullTableMagicNumber);
uint64_t magic = table_magic_number_;
uint32_t fv = format_version_;
assert(fv != kInvalidFormatVersion);
assert(IsLegacyFooterFormat(magic) == (fv == 0));
ChecksumType ct = checksum_type();
// Allocate destination data and generate parts 1 and 3
const size_t original_size = dst->size();
char* part2;
if (fv > 0) {
dst->resize(original_size + kNewVersionsEncodedLength);
char* part1 = &(*dst)[original_size];
part2 = part1 + 1;
char* part3 = part2 + 2 * BlockHandle::kMaxEncodedLength;
assert(&(*dst)[dst->size() - 1] + 1 - part3 == /* part 3 size */ 12);
// Generate parts 1 and 3
part1[0] = ct;
EncodeFixed32(part3, fv);
EncodeFixed64(part3 + 4, magic);
} else {
const size_t original_size = dst->size();
dst->push_back(static_cast<char>(checksum_));
metaindex_handle_.EncodeTo(dst);
index_handle_.EncodeTo(dst);
dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
PutFixed32(dst, version());
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
assert(dst->size() == original_size + kNewVersionsEncodedLength);
dst->resize(original_size + kVersion0EncodedLength);
part2 = &(*dst)[original_size];
char* part3 = part2 + 2 * BlockHandle::kMaxEncodedLength;
assert(&(*dst)[dst->size() - 1] + 1 - part3 == /* part 3 size */ 8);
// Legacy SST files use kCRC32c checksum but it's not stored in footer.
assert(ct == kNoChecksum || ct == kCRC32c);
// Generate part 3 (part 1 empty)
EncodeFixed64(part3, magic);
}
}
Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
: version_(_version),
checksum_(kCRC32c),
table_magic_number_(_table_magic_number) {
// This should be guaranteed by constructor callers
assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
// Generate Part2
// Variable size encode handles (sigh)
part2 = metaindex_handle_.EncodeTo(part2);
/*part2 = */ index_handle_.EncodeTo(part2);
// remainder of part2 is already zero padded
}
Status Footer::DecodeFrom(Slice* input) {
assert(!HasInitializedTableMagicNumber());
Status Footer::DecodeFrom(Slice* input, uint64_t input_offset) {
(void)input_offset; // Future use
// Only decode to unused Footer
assert(table_magic_number_ == kNullTableMagicNumber);
assert(input != nullptr);
assert(input->size() >= kMinEncodedLength);
const char* magic_ptr =
input->data() + input->size() - kMagicNumberLengthByte;
const uint32_t magic_lo = DecodeFixed32(magic_ptr);
const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
(static_cast<uint64_t>(magic_lo)));
uint64_t magic = DecodeFixed64(magic_ptr);
// We check for legacy formats here and silently upconvert them
bool legacy = IsLegacyFooterFormat(magic);
@ -242,44 +269,51 @@ Status Footer::DecodeFrom(Slice* input) {
}
set_table_magic_number(magic);
// Parse Part3
if (legacy) {
// The size is already asserted to be at least kMinEncodedLength
// at the beginning of the function
input->remove_prefix(input->size() - kVersion0EncodedLength);
version_ = 0 /* legacy */;
checksum_ = kCRC32c;
format_version_ = 0 /* legacy */;
checksum_type_ = kCRC32c;
} else {
version_ = DecodeFixed32(magic_ptr - 4);
// Footer version 1 and higher will always occupy exactly this many bytes.
// It consists of the checksum type, two block handles, padding,
// a version number, and a magic number
if (input->size() < kNewVersionsEncodedLength) {
return Status::Corruption("input is too short to be an sstable");
} else {
input->remove_prefix(input->size() - kNewVersionsEncodedLength);
const char* part3_ptr = magic_ptr - 4;
format_version_ = DecodeFixed32(part3_ptr);
if (!IsSupportedFormatVersion(format_version_)) {
return Status::Corruption("Corrupt or unsupported format_version: " +
ROCKSDB_NAMESPACE::ToString(format_version_));
}
uint32_t chksum;
if (!GetVarint32(input, &chksum)) {
return Status::Corruption("bad checksum type");
// All known format versions >= 1 occupy exactly this many bytes.
if (input->size() < kNewVersionsEncodedLength) {
return Status::Corruption("Input is too short to be an SST file");
}
checksum_ = static_cast<ChecksumType>(chksum);
if (chksum != static_cast<uint32_t>(checksum_) ||
!IsSupportedChecksumType(checksum_)) {
return Status::Corruption("unknown checksum type " +
ROCKSDB_NAMESPACE::ToString(chksum));
uint64_t adjustment = input->size() - kNewVersionsEncodedLength;
input->remove_prefix(adjustment);
// Parse Part1
char chksum = input->data()[0];
checksum_type_ = lossless_cast<ChecksumType>(chksum);
if (!IsSupportedChecksumType(checksum_type())) {
return Status::Corruption(
"Corrupt or unsupported checksum type: " +
ROCKSDB_NAMESPACE::ToString(lossless_cast<uint8_t>(chksum)));
}
// Consume checksum type field
input->remove_prefix(1);
}
// Parse Part2
Status result = metaindex_handle_.DecodeFrom(input);
if (result.ok()) {
result = index_handle_.DecodeFrom(input);
}
if (result.ok()) {
// We skip over any leftover data (just padding for now) in "input"
const char* end = magic_ptr + kMagicNumberLengthByte;
*input = Slice(end, input->data() + input->size() - end);
if (!result.ok()) {
return result;
}
return result;
// Mark all input consumed (skip padding & part3)
*input = Slice(input->data() + input->size(), 0U);
return Status::OK();
}
std::string Footer::ToString() const {
@ -293,14 +327,12 @@ std::string Footer::ToString() const {
result.append("table_magic_number: " +
ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
} else {
result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) +
"\n ");
result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
result.append("index handle: " + index_handle_.ToString() + "\n ");
result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) +
"\n ");
result.append("table_magic_number: " +
ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
result.append("format version: " +
ROCKSDB_NAMESPACE::ToString(format_version_) + "\n ");
}
return result;
}
@ -319,10 +351,9 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
std::string footer_buf;
AlignedBuf internal_buf;
Slice footer_input;
size_t read_offset =
(file_size > Footer::kMaxEncodedLength)
? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
: 0;
uint64_t read_offset = (file_size > Footer::kMaxEncodedLength)
? file_size - Footer::kMaxEncodedLength
: 0;
Status s;
// TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now,
// there is no readahead for point lookups, so TryReadFromCache will fail if
@ -353,7 +384,7 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
file->file_name());
}
s = footer->DecodeFrom(&footer_input);
s = footer->DecodeFrom(&footer_input, read_offset);
if (!s.ok()) {
return s;
}
@ -376,7 +407,7 @@ inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) {
// more byte, except we don't need to re-mix the input checksum as long as
// we do this step only once (per checksum).
const uint32_t kRandomPrime = 0x6b9083d9;
return checksum ^ static_cast<uint8_t>(last_byte) * kRandomPrime;
return checksum ^ lossless_cast<uint8_t>(last_byte) * kRandomPrime;
}
} // namespace

@ -8,21 +8,20 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <stdint.h>
#include <cstdint>
#include <string>
#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "memory/memory_allocator.h"
#include "options/cf_options.h"
#include "port/malloc.h"
#include "port/port.h" // noexcept
#include "table/persistent_cache_options.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "util/hash.h"
namespace ROCKSDB_NAMESPACE {
@ -32,7 +31,7 @@ struct ReadOptions;
extern bool ShouldReportDetailedTime(Env* env, Statistics* stats);
// the length of the magic number in bytes.
const int kMagicNumberLengthByte = 8;
constexpr uint32_t kMagicNumberLengthByte = 8;
// BlockHandle is a pointer to the extent of a file that stores a data
// block or a meta block.
@ -52,6 +51,7 @@ class BlockHandle {
void set_size(uint64_t _size) { size_ = _size; }
void EncodeTo(std::string* dst) const;
char* EncodeTo(char* dst) const;
Status DecodeFrom(Slice* input);
Status DecodeSizeFrom(uint64_t offset, Slice* input);
@ -65,7 +65,7 @@ class BlockHandle {
static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
// Maximum encoding length of a BlockHandle
enum { kMaxEncodedLength = 10 + 10 };
static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length;
inline bool operator==(const BlockHandle& rhs) const {
return offset_ == rhs.offset_ && size_ == rhs.size_;
@ -117,94 +117,107 @@ inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
return format_version >= 2 ? 2 : 1;
}
inline bool BlockBasedTableSupportedVersion(uint32_t version) {
return version <= 5;
constexpr uint32_t kLatestFormatVersion = 5;
inline bool IsSupportedFormatVersion(uint32_t version) {
return version <= kLatestFormatVersion;
}
// Footer encapsulates the fixed information stored at the tail
// end of every table file.
// Footer encapsulates the fixed information stored at the tail end of every
// SST file. In general, it should only include things that cannot go
// elsewhere under the metaindex block. For example, checksum_type is
// required for verifying metaindex block checksum (when applicable), but
// index block handle can easily go in metaindex block (possible future).
class Footer {
public:
// Constructs a footer without specifying its table magic number.
// In such case, the table magic number of such footer should be
// initialized via @ReadFooterFromFile().
// Use this when you plan to load Footer with DecodeFrom(). Never use this
// when you plan to EncodeTo.
Footer() : Footer(kInvalidTableMagicNumber, 0) {}
// Use this constructor when you plan to write out the footer using
// EncodeTo(). Never use this constructor with DecodeFrom().
// `version` is same as `format_version` for block-based table.
Footer(uint64_t table_magic_number, uint32_t version);
// The version of the footer in this file
uint32_t version() const { return version_; }
// The checksum type used in this file
ChecksumType checksum() const { return checksum_; }
void set_checksum(const ChecksumType c) { checksum_ = c; }
// The block handle for the metaindex block of the table
Footer() {}
// Uses builder pattern rather than distinctive ctors
// Table magic number identifies file as RocksDB SST file and which kind of
// SST format is in use.
Footer& set_table_magic_number(uint64_t tmn);
uint64_t table_magic_number() const { return table_magic_number_; }
// A version (footer and more) within a kind of SST. (It would add more
// unnecessary complexity to separate footer versions and
// BBTO::format_version.)
Footer& set_format_version(uint32_t fv) {
format_version_ = fv;
return *this;
}
uint32_t format_version() const { return format_version_; }
// Block handle for metaindex block.
Footer& set_metaindex_handle(const BlockHandle& h) {
metaindex_handle_ = h;
return *this;
}
const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
// The block handle for the index block of the table
// Block handle for (top-level) index block.
Footer& set_index_handle(const BlockHandle& h) {
index_handle_ = h;
return *this;
}
const BlockHandle& index_handle() const { return index_handle_; }
void set_index_handle(const BlockHandle& h) { index_handle_ = h; }
uint64_t table_magic_number() const { return table_magic_number_; }
// Checksum type used in the file.
Footer& set_checksum_type(ChecksumType ct) {
checksum_type_ = ct;
return *this;
}
ChecksumType checksum_type() const {
return static_cast<ChecksumType>(checksum_type_);
}
void EncodeTo(std::string* dst) const;
// Appends serialized footer to `dst`. The starting offset of the footer
// within the file is required for future work.
void EncodeTo(std::string* dst, uint64_t footer_offset) const;
// Set the current footer based on the input slice.
//
// REQUIRES: table_magic_number_ is not set (i.e.,
// HasInitializedTableMagicNumber() is false). The function will initialize
// the magic number.
Status DecodeFrom(Slice* input);
// Deserialize a footer (populate fields) from `input` and check for various
// corruptions. On success (and some error cases) `input` is advanced past
// the footer. Like EncodeTo, the offset within the file will be needed for
// future work.
Status DecodeFrom(Slice* input, uint64_t input_offset);
// Encoded length of a Footer. Note that the serialization of a Footer will
// always occupy at least kMinEncodedLength bytes. If fields are changed
// the version number should be incremented and kMaxEncodedLength should be
// increased accordingly.
enum {
// Footer version 0 (legacy) will always occupy exactly this many bytes.
// It consists of two block handles, padding, and a magic number.
kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
// Footer of versions 1 and higher will always occupy exactly this many
// bytes. It consists of the checksum type, two block handles, padding,
// a version number (bigger than 1), and a magic number
kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
kMinEncodedLength = kVersion0EncodedLength,
kMaxEncodedLength = kNewVersionsEncodedLength,
};
static const uint64_t kInvalidTableMagicNumber = 0;
// convert this object to a human readable form
// Convert this object to a human readable form
std::string ToString() const;
// Block trailer size used by file with this footer (e.g. 5 for block-based
// table and 0 for plain table)
inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
private:
// REQUIRES: magic number wasn't initialized.
void set_table_magic_number(uint64_t magic_number);
// Encoded lengths of Footers. Bytes for serialized Footer will always be
// >= kMinEncodedLength and <= kMaxEncodedLength.
//
// Footer version 0 (legacy) will always occupy exactly this many bytes.
// It consists of two block handles, padding, and a magic number.
static constexpr uint32_t kVersion0EncodedLength =
2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;
// Footer of versions 1 and higher will always occupy exactly this many
// bytes. It originally consisted of the checksum type, two block handles,
// padding (to maximum handle encoding size), a format version number, and a
// magic number.
static constexpr uint32_t kNewVersionsEncodedLength =
1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;
static constexpr uint64_t kNullTableMagicNumber = 0;
// return true if @table_magic_number_ is set to a value different
// from @kInvalidTableMagicNumber.
bool HasInitializedTableMagicNumber() const {
return (table_magic_number_ != kInvalidTableMagicNumber);
}
private:
static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
static constexpr int kInvalidChecksumType =
(1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;
uint32_t version_;
ChecksumType checksum_;
uint8_t block_trailer_size_ = 0; // set based on magic number
uint64_t table_magic_number_ = kNullTableMagicNumber;
uint32_t format_version_ = kInvalidFormatVersion;
BlockHandle metaindex_handle_;
BlockHandle index_handle_;
uint64_t table_magic_number_ = 0;
int checksum_type_ = kInvalidChecksumType;
uint8_t block_trailer_size_ = 0; // set based on magic number
};
// Read the footer from file

@ -26,11 +26,11 @@
namespace ROCKSDB_NAMESPACE {
const std::string kPropertiesBlock = "rocksdb.properties";
const std::string kPropertiesBlockName = "rocksdb.properties";
// Old property block name for backward compatibility
const std::string kPropertiesBlockOldName = "rocksdb.stats";
const std::string kCompressionDictBlock = "rocksdb.compression_dict";
const std::string kRangeDelBlock = "rocksdb.range_del";
const std::string kCompressionDictBlockName = "rocksdb.compression_dict";
const std::string kRangeDelBlockName = "rocksdb.range_del";
MetaIndexBuilder::MetaIndexBuilder()
: meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
@ -381,7 +381,7 @@ Status ReadTablePropertiesHelper(
// Modified version of BlockFetcher checksum verification
// (See write_global_seqno comment above)
if (s.ok() && footer.GetBlockTrailerSize() > 0) {
s = VerifyBlockChecksum(footer.checksum(), properties_block.data(),
s = VerifyBlockChecksum(footer.checksum_type(), properties_block.data(),
block_size, file->file_name(), handle.offset());
if (s.IsCorruption()) {
if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
@ -391,8 +391,8 @@ Status ReadTablePropertiesHelper(
new_table_properties->external_sst_file_global_seqno_offset -
handle.offset();
EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
s = VerifyBlockChecksum(footer.checksum(), tmp_buf.data(), block_size,
file->file_name(), handle.offset());
s = VerifyBlockChecksum(footer.checksum_type(), tmp_buf.data(),
block_size, file->file_name(), handle.offset());
}
}
}
@ -413,7 +413,7 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
BlockHandle block_handle;
Footer footer;
Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions,
kPropertiesBlock, &block_handle,
kPropertiesBlockName, &block_handle,
memory_allocator, prefetch_buffer, &footer);
if (!s.ok()) {
return s;
@ -438,7 +438,7 @@ Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) {
Slice v = meta_index_iter->value();
return block_handle->DecodeFrom(&v);
} else if (meta_block_name == kPropertiesBlock) {
} else if (meta_block_name == kPropertiesBlockName) {
// Have to try old name for compatibility
meta_index_iter->Seek(kPropertiesBlockOldName);
if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&

@ -31,10 +31,10 @@ class RandomAccessFile;
struct TableProperties;
// Meta block names for metaindex
extern const std::string kPropertiesBlock;
extern const std::string kPropertiesBlockName;
extern const std::string kPropertiesBlockOldName;
extern const std::string kCompressionDictBlock;
extern const std::string kRangeDelBlock;
extern const std::string kCompressionDictBlockName;
extern const std::string kRangeDelBlockName;
class MetaIndexBuilder {
public:

@ -279,7 +279,7 @@ Status PlainTableBuilder::Finish() {
if (!s.ok()) {
return std::move(s);
}
meta_index_builer.Add(kPropertiesBlock, property_block_handle);
meta_index_builer.Add(kPropertiesBlockName, property_block_handle);
// -- write metaindex block
BlockHandle metaindex_block_handle;
@ -292,11 +292,13 @@ Status PlainTableBuilder::Finish() {
// Write Footer
// no need to write out new footer if we're using default checksum
Footer footer(kLegacyPlainTableMagicNumber, 0);
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(BlockHandle::NullBlockHandle());
Footer footer;
footer.set_table_magic_number(kLegacyPlainTableMagicNumber)
.set_format_version(0)
.set_metaindex_handle(metaindex_block_handle)
.set_index_handle(BlockHandle::NullBlockHandle());
std::string footer_encoding;
footer.EncodeTo(&footer_encoding);
footer.EncodeTo(&footer_encoding, offset_);
io_status_ = file_->Append(footer_encoding);
if (io_status_.ok()) {
offset_ += footer_encoding.size();

@ -74,7 +74,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
// Warning about 'magic_number' being uninitialized shows up only in UBsan
// builds. Though access is guarded by 's.ok()' checks, fix the issue to
// avoid any warnings.
uint64_t magic_number = Footer::kInvalidTableMagicNumber;
uint64_t magic_number = Footer::kNullTableMagicNumber;
// read table magic number
Footer footer;

@ -21,16 +21,15 @@
#include <unordered_set>
#include <vector>
#include "block_fetcher.h"
#include "cache/lru_cache.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "memtable/stl_wrappers.h"
#include "meta_blocks.h"
#include "monitoring/statistics.h"
#include "options/options_helper.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
#include "rocksdb/compression_type.h"
#include "rocksdb/db.h"
@ -53,9 +52,11 @@
#include "table/block_based/block_based_table_reader.h"
#include "table/block_based/block_builder.h"
#include "table/block_based/flush_block_policy.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/plain/plain_table_factory.h"
#include "table/scoped_arena_iterator.h"
#include "table/sst_file_writer_collectors.h"
@ -1356,10 +1357,8 @@ class FileChecksumTestHelper {
uint64_t FileChecksumTestHelper::checksum_uniq_id_ = 1;
INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest,
testing::Values(test::kDefaultFormatVersion));
INSTANTIATE_TEST_CASE_P(FormatLatest, BlockBasedTableTest,
testing::Values(test::kLatestFormatVersion));
INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest,
testing::ValuesIn(test::kFooterFormatVersionsToTest));
// This test serves as the living tutorial for the prefix scan of user collected
// properties.
@ -2228,7 +2227,8 @@ TEST_P(BlockBasedTableTest, BadChecksumType) {
const MutableCFOptions new_moptions(options);
Status s = c.Reopen(new_ioptions, new_moptions);
ASSERT_NOK(s);
ASSERT_MATCHES_REGEX(s.ToString(), "Corruption: unknown checksum type 123.*");
ASSERT_EQ(s.ToString(),
"Corruption: Corrupt or unsupported checksum type: 123");
}
namespace {
@ -4166,106 +4166,107 @@ TEST_P(ParameterizedHarnessTest, SimpleSpecialKey) {
}
TEST(TableTest, FooterTests) {
Random* r = Random::GetTLSInstance();
uint64_t data_size = (uint64_t{1} << r->Uniform(40)) + r->Uniform(100);
uint64_t index_size = r->Uniform(1000000000);
uint64_t metaindex_size = r->Uniform(1000000);
// 5 == block trailer size
BlockHandle index(data_size + 5, index_size);
BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size);
uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5;
{
// upconvert legacy block based
std::string encoded;
Footer footer(kLegacyBlockBasedTableMagicNumber, 0);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
footer.EncodeTo(&encoded);
Footer footer;
footer.set_table_magic_number(kLegacyBlockBasedTableMagicNumber)
.set_format_version(0)
.set_metaindex_handle(meta_index)
.set_index_handle(index);
footer.EncodeTo(&encoded, footer_offset);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice));
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 0U);
ASSERT_EQ(decoded_footer.format_version(), 0U);
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
}
// block based, various checksums, various versions
for (auto t : GetSupportedChecksums()) {
// block based, various checksums
std::string encoded;
Footer footer(kBlockBasedTableMagicNumber, 1);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
footer.set_checksum(t);
footer.EncodeTo(&encoded);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice));
ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum(), t);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 1U);
for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) {
std::string encoded;
Footer footer;
footer.set_table_magic_number(kBlockBasedTableMagicNumber)
.set_format_version(fv)
.set_metaindex_handle(meta_index)
.set_index_handle(index)
.set_checksum_type(t);
footer.EncodeTo(&encoded, footer_offset);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(),
kBlockBasedTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum_type(), t);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(),
meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.format_version(), fv);
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
}
}
// Plain table is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
{
// upconvert legacy plain table
std::string encoded;
Footer footer(kLegacyPlainTableMagicNumber, 0);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
footer.EncodeTo(&encoded);
Footer footer;
footer.set_table_magic_number(kLegacyPlainTableMagicNumber)
.set_format_version(0)
.set_metaindex_handle(meta_index)
.set_index_handle(index);
footer.EncodeTo(&encoded, footer_offset);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice));
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 0U);
ASSERT_EQ(decoded_footer.format_version(), 0U);
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
}
{
// xxhash plain table (not currently used)
std::string encoded;
Footer footer(kPlainTableMagicNumber, 1);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
footer.set_checksum(kxxHash);
footer.EncodeTo(&encoded);
Footer footer;
footer.set_table_magic_number(kPlainTableMagicNumber)
.set_format_version(1)
.set_metaindex_handle(meta_index)
.set_index_handle(index)
.set_checksum_type(kxxHash);
footer.EncodeTo(&encoded, footer_offset);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice));
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum(), kxxHash);
ASSERT_EQ(decoded_footer.checksum_type(), kxxHash);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 1U);
ASSERT_EQ(decoded_footer.format_version(), 1U);
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
}
#endif // !ROCKSDB_LITE
{
// version == 2
std::string encoded;
Footer footer(kBlockBasedTableMagicNumber, 2);
BlockHandle meta_index(10, 5), index(20, 15);
footer.set_metaindex_handle(meta_index);
footer.set_index_handle(index);
footer.EncodeTo(&encoded);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice));
ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.version(), 2U);
}
}
class IndexBlockRestartIntervalTest
@ -4786,7 +4787,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
// -- Read properties block
BlockHandle properties_handle;
ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlock,
ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlockName,
&properties_handle));
ASSERT_FALSE(properties_handle.IsNull());
BlockContents properties_contents;
@ -4873,7 +4874,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
key_at_max_offset = metaindex_iter->key().ToString();
}
}
ASSERT_EQ(kPropertiesBlock, key_at_max_offset);
ASSERT_EQ(kPropertiesBlockName, key_at_max_offset);
// index handle is stored in footer rather than metaindex block, so need
// separate logic to verify it comes before properties block.
ASSERT_GT(max_offset, footer.index_handle().offset());
@ -5369,6 +5370,7 @@ TEST_P(
} // namespace ROCKSDB_NAMESPACE
// Entry point of the test binary: runs every registered test and returns
// the result of RUN_ALL_TESTS() as the process exit status.
int main(int argc, char** argv) {
// Installed first so the handler is already in place while the tests run
// (presumably to print a stack trace on a crash -- confirm in port/).
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
// Hand the command line to googletest before any test is executed.
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

@ -38,7 +38,12 @@ namespace ROCKSDB_NAMESPACE {
namespace test {
const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version;
const uint32_t kLatestFormatVersion = 5u;
// Format versions used to parameterize footer-related tests (this set is
// what gets passed to testing::ValuesIn for BlockBasedTableTest). Because it
// is a std::set, entries that happen to have the same value collapse into a
// single element, so no version is tested twice.
const std::set<uint32_t> kFooterFormatVersionsToTest{
5U,
// In case any interesting future changes
kDefaultFormatVersion,
kLatestFormatVersion,
};
std::string RandomKey(Random* rnd, int len, RandomKeyType type) {
// Make sure to generate a wide variety of characters so we

@ -44,7 +44,7 @@ class SequentialFileReader;
namespace test {
extern const uint32_t kDefaultFormatVersion;
extern const uint32_t kLatestFormatVersion;
extern const std::set<uint32_t> kFooterFormatVersionsToTest;
// Return a random key with the specified length that may contain interesting
// characters (e.g. \x00, \xff, etc.).

@ -5,6 +5,8 @@
#pragma once
#include <type_traits>
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
@ -20,4 +22,21 @@ inline DestClass* static_cast_with_check(SrcClass* x) {
#endif
return ret;
}
// static_cast restricted to conversions that cannot drop bits.
//
// Both the source and the destination must be integral or enum types, and
// the destination must be at least as wide as the source; anything else is
// rejected at compile time. Typical uses are signed <-> unsigned conversions
// of the same width and enum <-> underlying-type conversions, where a plain
// static_cast would silently start truncating if one of the types were
// changed later.
template <typename To, typename From>
inline To lossless_cast(From x) {
  // Strip a reference in case `From` was spelled out explicitly as one.
  using Source = typename std::remove_reference<From>::type;

  static_assert(std::is_enum<Source>::value || std::is_integral<Source>::value,
                "Only works on integral types");
  static_assert(std::is_enum<To>::value || std::is_integral<To>::value,
                "Only works on integral types");
  // Width check: the destination has to hold every bit of the source.
  static_assert(sizeof(Source) <= sizeof(To), "Must be lossless");

  return static_cast<To>(x);
}
} // namespace ROCKSDB_NAMESPACE

@ -31,7 +31,7 @@
namespace ROCKSDB_NAMESPACE {
// The maximum length of a varint in bytes for 64-bit.
const unsigned int kMaxVarint64Length = 10;
const uint32_t kMaxVarint64Length = 10;
// Standard Put... routines append to a string
extern void PutFixed16(std::string* dst, uint16_t value);

Loading…
Cancel
Save