Write properties metablock last in block-based tables (#4158)

Summary:
The properties meta-block should come at the end since we always need to
read it when opening a file, unlike index/filter/other meta-blocks, which
are sometimes read depending on the user's configuration. This ordering
will allow us to (in a future PR) do a small readahead on the end of the file
to read properties and meta-index blocks with one I/O.

The bulk of this PR is a refactoring of the `BlockBasedTableBuilder::Finish`
function. It was previously too large with inconsistent error handling, which
made it difficult to change. So I broke it up into one function per meta-block
write, and tried to make error handling consistent within those functions.
Then reordering the metablocks was trivial -- just reorder the calls to these
helper functions.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4158

Differential Revision: D8921705

Pulled By: ajkr

fbshipit-source-id: 96c9cc3182eb1adf11af46adab79dbeba7b12fcc
main
Andrew Kryczka 7 years ago committed by Facebook Github Bot
parent 2736752b33
commit ab35505e21
  1. 323
      table/block_based_table_builder.cc
  2. 9
      table/block_based_table_builder.h
  3. 98
      table/table_test.cc

@ -37,7 +37,6 @@
#include "table/filter_block.h" #include "table/filter_block.h"
#include "table/format.h" #include "table/format.h"
#include "table/full_filter_block.h" #include "table/full_filter_block.h"
#include "table/meta_blocks.h"
#include "table/table_builder.h" #include "table/table_builder.h"
#include "util/string_util.h" #include "util/string_util.h"
@ -668,184 +667,204 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
return Status::OK(); return Status::OK();
} }
Status BlockBasedTableBuilder::Finish() { void BlockBasedTableBuilder::WriteFilterBlock(
Rep* r = rep_; MetaIndexBuilder* meta_index_builder) {
bool empty_data_block = r->data_block.empty(); BlockHandle filter_block_handle;
Flush(); bool empty_filter_block = (rep_->filter_builder == nullptr ||
assert(!r->closed); rep_->filter_builder->NumAdded() == 0);
r->closed = true;
// To make sure properties block is able to keep the accurate size of index
// block, we will finish writing all index entries here and flush them
// to storage after metaindex block is written.
if (ok() && !empty_data_block) {
r->index_builder->AddIndexEntry(
&r->last_key, nullptr /* no next data block */, r->pending_handle);
}
BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle,
compression_dict_block_handle, range_del_block_handle;
// Write filter block
bool empty_filter_block = (r->filter_builder == nullptr ||
r->filter_builder->NumAdded() == 0);
if (ok() && !empty_filter_block) { if (ok() && !empty_filter_block) {
Status s = Status::Incomplete(); Status s = Status::Incomplete();
while (s.IsIncomplete()) { while (ok() && s.IsIncomplete()) {
Slice filter_content = r->filter_builder->Finish(filter_block_handle, &s); Slice filter_content = rep_->filter_builder->Finish(filter_block_handle, &s);
assert(s.ok() || s.IsIncomplete()); assert(s.ok() || s.IsIncomplete());
r->props.filter_size += filter_content.size(); rep_->props.filter_size += filter_content.size();
WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); WriteRawBlock(filter_content, kNoCompression, &filter_block_handle);
} }
} }
if (ok() && !empty_filter_block) {
// Add mapping from "<filter_block_prefix>.Name" to location
// of filter data.
std::string key;
if (rep_->filter_builder->IsBlockBased()) {
key = BlockBasedTable::kFilterBlockPrefix;
} else {
key = rep_->table_options.partition_filters
? BlockBasedTable::kPartitionedFilterBlockPrefix
: BlockBasedTable::kFullFilterBlockPrefix;
}
key.append(rep_->table_options.filter_policy->Name());
meta_index_builder->Add(key, filter_block_handle);
}
}
void BlockBasedTableBuilder::WriteIndexBlock(
MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
IndexBuilder::IndexBlocks index_blocks; IndexBuilder::IndexBlocks index_blocks;
auto index_builder_status = r->index_builder->Finish(&index_blocks); auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
if (index_builder_status.IsIncomplete()) { if (index_builder_status.IsIncomplete()) {
// If we have more than one index partition then meta_blocks are not // If we have more than one index partition then meta_blocks are not
// supported for the index. Currently meta_blocks are used only by // supported for the index. Currently meta_blocks are used only by
// HashIndexBuilder which is not multi-partition. // HashIndexBuilder which is not multi-partition.
assert(index_blocks.meta_blocks.empty()); assert(index_blocks.meta_blocks.empty());
} else if (!index_builder_status.ok()) { } else if (ok() && !index_builder_status.ok()) {
return index_builder_status; rep_->status = index_builder_status;
} }
// Write meta blocks and metaindex block with the following order.
// 1. [meta block: filter]
// 2. [meta block: properties]
// 3. [meta block: compression dictionary]
// 4. [meta block: range deletion tombstone]
// 5. [metaindex block]
// write meta blocks
MetaIndexBuilder meta_index_builder;
for (const auto& item : index_blocks.meta_blocks) {
BlockHandle block_handle;
WriteBlock(item.second, &block_handle, false /* is_data_block */);
meta_index_builder.Add(item.first, block_handle);
}
if (ok()) { if (ok()) {
if (!empty_filter_block) { for (const auto& item : index_blocks.meta_blocks) {
// Add mapping from "<filter_block_prefix>.Name" to location BlockHandle block_handle;
// of filter data. WriteBlock(item.second, &block_handle, false /* is_data_block */);
std::string key; if (!ok()) {
if (r->filter_builder->IsBlockBased()) { break;
key = BlockBasedTable::kFilterBlockPrefix;
} else {
key = r->table_options.partition_filters
? BlockBasedTable::kPartitionedFilterBlockPrefix
: BlockBasedTable::kFullFilterBlockPrefix;
} }
key.append(r->table_options.filter_policy->Name()); meta_index_builder->Add(item.first, block_handle);
meta_index_builder.Add(key, filter_block_handle); }
}
if (ok()) {
if (rep_->table_options.enable_index_compression) {
WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
} else {
WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
index_block_handle);
} }
}
// If there are more index partitions, finish them and write them out
Status s = index_builder_status;
while (ok() && s.IsIncomplete()) {
s = rep_->index_builder->Finish(&index_blocks, *index_block_handle);
if (!s.ok() && !s.IsIncomplete()) {
rep_->status = s;
return;
}
if (rep_->table_options.enable_index_compression) {
WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
} else {
WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
index_block_handle);
}
// The last index_block_handle will be for the partition index block
}
}
// Write properties and compression dictionary blocks. void BlockBasedTableBuilder::WritePropertiesBlock(
{ MetaIndexBuilder* meta_index_builder) {
PropertyBlockBuilder property_block_builder; BlockHandle properties_block_handle;
r->props.column_family_id = r->column_family_id; if (ok()) {
r->props.column_family_name = r->column_family_name; PropertyBlockBuilder property_block_builder;
r->props.filter_policy_name = r->table_options.filter_policy != nullptr ? rep_->props.column_family_id = rep_->column_family_id;
r->table_options.filter_policy->Name() : ""; rep_->props.column_family_name = rep_->column_family_name;
r->props.index_size = rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr
r->index_builder->EstimatedSize() + kBlockTrailerSize; ? rep_->table_options.filter_policy->Name()
r->props.comparator_name = r->ioptions.user_comparator != nullptr : "";
? r->ioptions.user_comparator->Name() rep_->props.index_size = rep_->index_builder->EstimatedSize() + kBlockTrailerSize;
: "nullptr"; rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr
r->props.merge_operator_name = r->ioptions.merge_operator != nullptr ? rep_->ioptions.user_comparator->Name()
? r->ioptions.merge_operator->Name() : "nullptr";
rep_->props.merge_operator_name = rep_->ioptions.merge_operator != nullptr
? rep_->ioptions.merge_operator->Name()
: "nullptr";
rep_->props.compression_name =
CompressionTypeToString(rep_->compression_ctx.type());
rep_->props.prefix_extractor_name = rep_->moptions.prefix_extractor != nullptr
? rep_->moptions.prefix_extractor->Name()
: "nullptr"; : "nullptr";
r->props.compression_name =
CompressionTypeToString(r->compression_ctx.type()); std::string property_collectors_names = "[";
r->props.prefix_extractor_name = for (size_t i = 0;
r->moptions.prefix_extractor != nullptr i < rep_->ioptions.table_properties_collector_factories.size(); ++i) {
? r->moptions.prefix_extractor->Name() if (i != 0) {
: "nullptr"; property_collectors_names += ",";
std::string property_collectors_names = "[";
for (size_t i = 0;
i < r->ioptions.table_properties_collector_factories.size(); ++i) {
if (i != 0) {
property_collectors_names += ",";
}
property_collectors_names +=
r->ioptions.table_properties_collector_factories[i]->Name();
}
property_collectors_names += "]";
r->props.property_collectors_names = property_collectors_names;
if (r->table_options.index_type ==
BlockBasedTableOptions::kTwoLevelIndexSearch) {
assert(r->p_index_builder_ != nullptr);
r->props.index_partitions = r->p_index_builder_->NumPartitions();
r->props.top_level_index_size =
r->p_index_builder_->EstimateTopLevelIndexSize(r->offset);
}
r->props.index_key_is_user_key =
!r->index_builder->seperator_is_key_plus_seq();
r->props.creation_time = r->creation_time;
r->props.oldest_key_time = r->oldest_key_time;
// Add basic properties
property_block_builder.AddTableProperty(r->props);
// Add use collected properties
NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors,
r->ioptions.info_log,
&property_block_builder);
BlockHandle properties_block_handle;
WriteRawBlock(
property_block_builder.Finish(),
kNoCompression,
&properties_block_handle
);
meta_index_builder.Add(kPropertiesBlock, properties_block_handle);
// Write compression dictionary block
if (r->compression_dict && r->compression_dict->size()) {
WriteRawBlock(*r->compression_dict, kNoCompression,
&compression_dict_block_handle);
meta_index_builder.Add(kCompressionDictBlock,
compression_dict_block_handle);
} }
} // end of properties/compression dictionary block writing property_collectors_names +=
rep_->ioptions.table_properties_collector_factories[i]->Name();
}
property_collectors_names += "]";
rep_->props.property_collectors_names = property_collectors_names;
if (rep_->table_options.index_type ==
BlockBasedTableOptions::kTwoLevelIndexSearch) {
assert(rep_->p_index_builder_ != nullptr);
rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
rep_->props.top_level_index_size =
rep_->p_index_builder_->EstimateTopLevelIndexSize(rep_->offset);
}
rep_->props.index_key_is_user_key =
!rep_->index_builder->seperator_is_key_plus_seq();
rep_->props.creation_time = rep_->creation_time;
rep_->props.oldest_key_time = rep_->oldest_key_time;
// Add basic properties
property_block_builder.AddTableProperty(rep_->props);
if (ok() && !r->range_del_block.empty()) { // Add use collected properties
WriteRawBlock(r->range_del_block.Finish(), kNoCompression, NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors,
&range_del_block_handle); rep_->ioptions.info_log,
meta_index_builder.Add(kRangeDelBlock, range_del_block_handle); &property_block_builder);
} // range deletion tombstone meta block
} // meta blocks
// Write index block WriteRawBlock(property_block_builder.Finish(), kNoCompression,
&properties_block_handle);
}
if (ok()) { if (ok()) {
// flush the meta index block meta_index_builder->Add(kPropertiesBlock, properties_block_handle);
WriteRawBlock(meta_index_builder.Finish(), kNoCompression, }
&metaindex_block_handle); }
if (r->table_options.enable_index_compression) { void BlockBasedTableBuilder::WriteCompressionDictBlock(
WriteBlock(index_blocks.index_block_contents, &index_block_handle, false); MetaIndexBuilder* meta_index_builder) {
} else { if (rep_->compression_dict && rep_->compression_dict->size()) {
WriteRawBlock(index_blocks.index_block_contents, kNoCompression, BlockHandle compression_dict_block_handle;
&index_block_handle); if (ok()) {
WriteRawBlock(*rep_->compression_dict, kNoCompression,
&compression_dict_block_handle);
} }
// If there are more index partitions, finish them and write them out if (ok()) {
Status& s = index_builder_status; meta_index_builder->Add(kCompressionDictBlock,
while (s.IsIncomplete()) { compression_dict_block_handle);
s = r->index_builder->Finish(&index_blocks, index_block_handle);
if (!s.ok() && !s.IsIncomplete()) {
return s;
}
if (r->table_options.enable_index_compression) {
WriteBlock(index_blocks.index_block_contents, &index_block_handle,
false);
} else {
WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
&index_block_handle);
}
// The last index_block_handle will be for the partition index block
} }
} }
}
// Writes the range deletion tombstone meta-block (uncompressed) and registers
// its handle in `meta_index_builder` under kRangeDelBlock.
//
// Does nothing if the builder is already in a failed state or if the range
// deletion block is empty. The handle is only registered when the write left
// the builder in an ok() state, which matches how WritePropertiesBlock() and
// WriteCompressionDictBlock() guard their own metaindex registration.
void BlockBasedTableBuilder::WriteRangeDelBlock(
    MetaIndexBuilder* meta_index_builder) {
  if (!ok() || rep_->range_del_block.empty()) {
    return;
  }
  BlockHandle range_del_block_handle;
  WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression,
                &range_del_block_handle);
  // Do not advertise a block whose write did not succeed.
  if (ok()) {
    meta_index_builder->Add(kRangeDelBlock, range_del_block_handle);
  }
}
Status BlockBasedTableBuilder::Finish() {
Rep* r = rep_;
bool empty_data_block = r->data_block.empty();
Flush();
assert(!r->closed);
r->closed = true;
// To make sure properties block is able to keep the accurate size of index
// block, we will finish writing all index entries first.
if (ok() && !empty_data_block) {
r->index_builder->AddIndexEntry(
&r->last_key, nullptr /* no next data block */, r->pending_handle);
}
// Write meta blocks and metaindex block with the following order.
// 1. [meta block: filter]
// 2. [meta block: index]
// 3. [meta block: compression dictionary]
// 4. [meta block: range deletion tombstone]
// 5. [meta block: properties]
// 6. [metaindex block]
BlockHandle metaindex_block_handle, index_block_handle;
MetaIndexBuilder meta_index_builder;
WriteFilterBlock(&meta_index_builder);
WriteIndexBlock(&meta_index_builder, &index_block_handle);
WriteCompressionDictBlock(&meta_index_builder);
WriteRangeDelBlock(&meta_index_builder);
WritePropertiesBlock(&meta_index_builder);
if (ok()) {
// flush the meta index block
WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
&metaindex_block_handle);
}
// Write footer // Write footer
if (ok()) { if (ok()) {

@ -18,6 +18,7 @@
#include "rocksdb/listener.h" #include "rocksdb/listener.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "table/meta_blocks.h"
#include "table/table_builder.h" #include "table/table_builder.h"
#include "util/compression.h" #include "util/compression.h"
@ -106,6 +107,14 @@ class BlockBasedTableBuilder : public TableBuilder {
Status InsertBlockInCache(const Slice& block_contents, Status InsertBlockInCache(const Slice& block_contents,
const CompressionType type, const CompressionType type,
const BlockHandle* handle); const BlockHandle* handle);
// Helpers used by Finish() to emit one kind of meta-block each. Every helper
// checks ok() before writing, so once the builder is in a failed state the
// remaining helpers write nothing.

// Writes the filter block(s) uncompressed and, on success, registers the
// filter handle in `meta_index_builder` under a key made of the filter block
// prefix followed by the filter policy name. Does nothing when there is no
// filter builder or no keys were added to it.
void WriteFilterBlock(MetaIndexBuilder* meta_index_builder);
// Finishes the index builder, writes any index meta-blocks (registering each
// in `meta_index_builder`), then writes the index block itself, compressed
// only if table_options.enable_index_compression is set. For a partitioned
// index this keeps writing until the index builder stops reporting
// Incomplete; `*index_block_handle` ends up holding the handle of the last
// block written. A non-Incomplete failure from the index builder is stored in
// rep_->status.
void WriteIndexBlock(MetaIndexBuilder* meta_index_builder,
BlockHandle* index_block_handle);
// Populates the table properties (including user-collected properties),
// writes the properties block uncompressed and, on success, registers it in
// `meta_index_builder` under kPropertiesBlock.
void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder);
// If a non-empty compression dictionary is present, writes it uncompressed
// and, on success, registers it under kCompressionDictBlock.
void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder);
// If the range deletion block is non-empty, writes it uncompressed and
// registers it under kRangeDelBlock.
void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder);
struct Rep; struct Rep;
class BlockBasedTablePropertiesCollectorFactory; class BlockBasedTablePropertiesCollectorFactory;
class BlockBasedTablePropertiesCollector; class BlockBasedTablePropertiesCollector;

@ -352,19 +352,19 @@ class TableConstructor: public Constructor {
file_writer_->Flush(); file_writer_->Flush();
EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_TRUE(s.ok()) << s.ToString();
EXPECT_EQ(GetSink()->contents().size(), builder->FileSize()); EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize());
// Open the table // Open the table
uniq_id_ = cur_uniq_id_++; uniq_id_ = cur_uniq_id_++;
file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource(
GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)));
const bool kSkipFilters = true; const bool kSkipFilters = true;
const bool kImmortal = true; const bool kImmortal = true;
return ioptions.table_factory->NewTableReader( return ioptions.table_factory->NewTableReader(
TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
internal_comparator, !kSkipFilters, !kImmortal, internal_comparator, !kSkipFilters, !kImmortal,
level_), level_),
std::move(file_reader_), GetSink()->contents().size(), &table_reader_); std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_);
} }
virtual InternalIterator* NewIterator( virtual InternalIterator* NewIterator(
@ -390,11 +390,11 @@ class TableConstructor: public Constructor {
virtual Status Reopen(const ImmutableCFOptions& ioptions, virtual Status Reopen(const ImmutableCFOptions& ioptions,
const MutableCFOptions& moptions) { const MutableCFOptions& moptions) {
file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource(
GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)));
return ioptions.table_factory->NewTableReader( return ioptions.table_factory->NewTableReader(
TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
*last_internal_key_), *last_internal_key_),
std::move(file_reader_), GetSink()->contents().size(), &table_reader_); std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_);
} }
virtual TableReader* GetTableReader() { virtual TableReader* GetTableReader() {
@ -409,6 +409,10 @@ class TableConstructor: public Constructor {
bool ConvertToInternalKey() { return convert_to_internal_key_; } bool ConvertToInternalKey() { return convert_to_internal_key_; }
// Test-only accessor: returns the file behind file_writer_ viewed as a
// test::StringSink. The static_cast is unchecked, so this is only valid when
// the writer was created on top of a StringSink.
test::StringSink* TEST_GetSink() {
  auto* underlying_file = file_writer_->writable_file();
  return static_cast<test::StringSink*>(underlying_file);
}
private: private:
void Reset() { void Reset() {
uniq_id_ = 0; uniq_id_ = 0;
@ -417,10 +421,6 @@ class TableConstructor: public Constructor {
file_reader_.reset(); file_reader_.reset();
} }
// Returns the file behind file_writer_ viewed as a test::StringSink. The
// static_cast is unchecked, so the writer must have been created on top of a
// StringSink.
test::StringSink* GetSink() {
  auto* underlying_file = file_writer_->writable_file();
  return static_cast<test::StringSink*>(underlying_file);
}
uint64_t uniq_id_; uint64_t uniq_id_;
unique_ptr<WritableFileWriter> file_writer_; unique_ptr<WritableFileWriter> file_writer_;
unique_ptr<RandomAccessFileReader> file_reader_; unique_ptr<RandomAccessFileReader> file_reader_;
@ -3494,6 +3494,86 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
} }
} }
// Builds a small block-based table with a full (non-block-based) bloom filter
// and checks, via the metaindex block, that the properties meta-block sits at
// the highest offset of all meta-blocks and also after the index block.
TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
  // Rationale for the layout: the properties block is read on every table
  // open, while index/filter/other meta-blocks are read only for some user
  // configurations. Keeping properties next to the metaindex block allows a
  // small readahead on the file tail to fetch both with one I/O.
  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
  static const char* const kEntries[][2] = {
      {"a1", "val1"}, {"b2", "val2"}, {"c3", "val3"},
      {"d4", "val4"}, {"e5", "val5"}, {"f6", "val6"},
      {"g7", "val7"}, {"h8", "val8"}, {"j9", "val9"},
  };
  for (const auto& entry : kEntries) {
    c.Add(entry[0], entry[1]);
  }

  // Write the SST file into the constructor's in-memory sink.
  Options options;
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
  table_options.filter_policy.reset(NewBloomFilterPolicy(
      8 /* bits_per_key */, false /* use_block_based_filter */));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  ImmutableCFOptions ioptions(options);
  MutableCFOptions moptions(options);
  std::vector<std::string> keys;
  stl_wrappers::KVMap kvmap;
  c.Finish(options, ioptions, moptions, table_options,
           GetPlainInternalComparator(options.comparator), &keys, &kvmap);

  // Wrap the written bytes in a random-access reader.
  test::StringSink* sink = c.TEST_GetSink();
  std::unique_ptr<RandomAccessFileReader> file_reader{
      test::GetRandomAccessFileReader(
          new test::StringSource(sink->contents(), 0 /* unique_id */,
                                 false /* allow_mmap_reads */))};
  size_t file_size = sink->contents().size();

  // The footer locates both the metaindex block and the index block.
  Footer footer;
  ASSERT_OK(ReadFooterFromFile(file_reader.get(),
                               nullptr /* prefetch_buffer */, file_size,
                               &footer, kBlockBasedTableMagicNumber));

  // Fetch the raw metaindex block.
  auto metaindex_handle = footer.metaindex_handle();
  BlockContents metaindex_contents;
  Slice compression_dict;
  PersistentCacheOptions pcache_opts;
  BlockFetcher block_fetcher(
      file_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
      metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
      compression_dict, pcache_opts);
  ASSERT_OK(block_fetcher.ReadBlockContents());
  Block metaindex_block(std::move(metaindex_contents),
                        kDisableGlobalSequenceNumber);

  // Scan every metaindex entry and remember which one points furthest into
  // the file; that entry must be the properties block.
  std::unique_ptr<InternalIterator> metaindex_iter{
      metaindex_block.NewIterator<DataBlockIter>(options.comparator,
                                                 options.comparator)};
  uint64_t max_offset = 0;
  std::string key_at_max_offset;
  for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
       metaindex_iter->Next()) {
    BlockHandle handle;
    Slice encoded_handle = metaindex_iter->value();
    ASSERT_OK(handle.DecodeFrom(&encoded_handle));
    if (max_offset < handle.offset()) {
      max_offset = handle.offset();
      key_at_max_offset = metaindex_iter->key().ToString();
    }
  }
  ASSERT_EQ(kPropertiesBlock, key_at_max_offset);

  // The index handle lives in the footer rather than the metaindex block, so
  // its position relative to the properties block is checked separately.
  ASSERT_GT(max_offset, footer.index_handle().offset());
  c.ResetTableReader();
}
TEST_P(BlockBasedTableTest, BadOptions) { TEST_P(BlockBasedTableTest, BadOptions) {
rocksdb::Options options; rocksdb::Options options;
options.compression = kNoCompression; options.compression = kNoCompression;

Loading…
Cancel
Save