// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#ifndef ROCKSDB_LITE

#include "table/sst_file_dumper.h"

#include <chrono>
#include <cinttypes>
#include <iostream>
#include <map>
#include <memory>
#include <sstream>
#include <vector>

#include "db/blob/blob_index.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "options/cf_options.h"
#include "port/port.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/utilities/ldb_cmd.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_builder.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_builder.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/plain/plain_table_factory.h"
#include "table/table_reader.h"
#include "util/compression.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

SstFileDumper::SstFileDumper(const Options& options,
                             const std::string& file_path,
                             size_t readahead_size, bool verify_checksum,
                             bool output_hex, bool decode_blob_index,
                             const EnvOptions& soptions, bool silent)
    : file_name_(file_path),
      read_num_(0),
      output_hex_(output_hex),
      decode_blob_index_(decode_blob_index),
      soptions_(soptions),
      silent_(silent),
      options_(options),
      ioptions_(options_),
      moptions_(ColumnFamilyOptions(options_)),
      read_options_(verify_checksum, false),
      internal_comparator_(BytewiseComparator()) {
  read_options_.readahead_size = readahead_size;
  if (!silent_) {
    fprintf(stdout, "Process %s\n", file_path.c_str());
  }
  init_result_ = GetTableReader(file_name_);
}
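
// Usage sketch (illustrative only): this mirrors how tools such as sst_dump
// drive the class. The file path is hypothetical, and the EnvOptions/silent
// arguments are assumed to have defaults in the header.
//
//   SstFileDumper dumper(Options(), "/path/to/000123.sst",
//                        /*readahead_size=*/0, /*verify_checksum=*/true,
//                        /*output_hex=*/false, /*decode_blob_index=*/false,
//                        EnvOptions(), /*silent=*/false);
//   // Scan and print every key-value pair in the file.
//   Status s = dumper.ReadSequential(/*print_kv=*/true, /*read_num=*/0,
//                                    /*has_from=*/false, "",
//                                    /*has_to=*/false, "",
//                                    /*use_from_as_prefix=*/false);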

extern const uint64_t kBlockBasedTableMagicNumber;
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
extern const uint64_t kLegacyPlainTableMagicNumber;

const char* testFileName = "test_file_name";
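
// Opens the file, identifies the table format from the footer's magic
// number, loads table properties, and constructs a matching TableReader.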
Status SstFileDumper::GetTableReader(const std::string& file_path) {
  // Warning about 'magic_number' being uninitialized shows up only in UBsan
  // builds. Though access is guarded by 's.ok()' checks, fix the issue to
  // avoid any warnings.
  uint64_t magic_number = Footer::kNullTableMagicNumber;

  // Read the table magic number.
  Footer footer;

  const auto& fs = options_.env->GetFileSystem();
  std::unique_ptr<FSRandomAccessFile> file;
  uint64_t file_size = 0;
  Status s = fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file,
                                     nullptr);
  if (s.ok()) {
    s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
  }

  // An empty file has no footer or blocks, so skip further processing.
  if (file_size == 0) {
    return Status::Aborted(file_path, "Empty file");
  }

  file_.reset(new RandomAccessFileReader(std::move(file), file_path));
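
  // The footer and meta blocks live at the tail of the file, so prefetching
  // the last 512KB up front avoids extra small reads while parsing them.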
  FilePrefetchBuffer prefetch_buffer(
      0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */,
      false /* track_min_offset */);
  if (s.ok()) {
    const uint64_t kSstDumpTailPrefetchSize = 512 * 1024;
    uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize)
                                 ? kSstDumpTailPrefetchSize
                                 : file_size;
    uint64_t prefetch_off = file_size - prefetch_size;
    IOOptions opts;
    s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off,
                                 static_cast<size_t>(prefetch_size));

    s = ReadFooterFromFile(opts, file_.get(), &prefetch_buffer, file_size,
                           &footer);
  }
  if (s.ok()) {
    magic_number = footer.table_magic_number();
  }
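
  // PlainTable only supports mmap-based reads, so when the magic number
  // identifies a plain table, reopen the file with mmap reads enabled.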
  if (s.ok()) {
    if (magic_number == kPlainTableMagicNumber ||
        magic_number == kLegacyPlainTableMagicNumber) {
      soptions_.use_mmap_reads = true;

      fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file,
                              nullptr);
      file_.reset(new RandomAccessFileReader(std::move(file), file_path));
    }

    // For old sst formats, ReadTableProperties might fail, but the file can
    // still be read.
    if (ReadTableProperties(magic_number, file_.get(), file_size,
                            (magic_number == kBlockBasedTableMagicNumber)
                                ? &prefetch_buffer
                                : nullptr)
            .ok()) {
      s = SetTableOptionsByMagicNumber(magic_number);
      if (s.ok()) {
        if (table_properties_ && !table_properties_->comparator_name.empty()) {
          ConfigOptions config_options;
          const Comparator* user_comparator = nullptr;
          s = Comparator::CreateFromString(config_options,
                                           table_properties_->comparator_name,
                                           &user_comparator);
          if (s.ok()) {
            assert(user_comparator);
            internal_comparator_ =
                InternalKeyComparator(user_comparator, /*named=*/true);
          }
        }
      }
    } else {
      s = SetOldTableOptions();
    }
    options_.comparator = internal_comparator_.user_comparator();
  }

  if (s.ok()) {
    s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
                       &table_reader_);
  }
  return s;
}

Status SstFileDumper::NewTableReader(
    const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
    const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
    std::unique_ptr<TableReader>* /*table_reader*/) {
  auto t_opt =
      TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_,
                         internal_comparator_, false /* skip_filters */,
                         false /* immortal */,
                         true /* force_direct_prefetch */);
  // Allow opening files with a global sequence number for backward
  // compatibility.
  t_opt.largest_seqno = kMaxSequenceNumber;

  // We need to turn off pre-fetching of index and filter nodes for
  // BlockBasedTable.
  if (options_.table_factory->IsInstanceOf(
          TableFactory::kBlockBasedTableName())) {
    return options_.table_factory->NewTableReader(t_opt, std::move(file_),
                                                  file_size, &table_reader_,
                                                  /*enable_prefetch=*/false);
  }

  // For all other table factory implementations.
  return options_.table_factory->NewTableReader(t_opt, std::move(file_),
                                                file_size, &table_reader_);
}
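
// Delegates to the table reader, which re-reads the file's blocks and
// verifies their stored checksums.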
Status SstFileDumper::VerifyChecksum() {
  // We could pass a specific readahead setting into read options if needed.
  return table_reader_->VerifyChecksum(read_options_,
                                       TableReaderCaller::kSSTDumpTool);
}
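
// Writes a raw dump of the table's contents (as produced by
// TableReader::DumpTable) to out_filename.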
Status SstFileDumper::DumpTable(const std::string& out_filename) {
  std::unique_ptr<WritableFile> out_file;
  Env* env = options_.env;
  Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
  if (s.ok()) {
    s = table_reader_->DumpTable(out_file.get());
  }
  if (!s.ok()) {
    // Close the file before returning the error; ignore any error from
    // Close() itself.
    out_file->Close().PermitUncheckedError();
    return s;
  }
  return out_file->Close();
}
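
// Estimates the compressed size of the table by re-encoding its contents
// into a scratch file in an in-memory Env with the given builder options,
// then reporting the resulting file size and data block count.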
Status SstFileDumper::CalculateCompressedTableSize(
    const TableBuilderOptions& tb_options, size_t block_size,
    uint64_t* num_data_blocks, uint64_t* compressed_table_size) {
  std::unique_ptr<Env> env(NewMemEnv(options_.env));
  std::unique_ptr<WritableFileWriter> dest_writer;
  Status s =
      WritableFileWriter::Create(env->GetFileSystem(), testFileName,
                                 FileOptions(soptions_), &dest_writer, nullptr);
  if (!s.ok()) {
    return s;
  }
  BlockBasedTableOptions table_options;
  table_options.block_size = block_size;
  BlockBasedTableFactory block_based_tf(table_options);
  std::unique_ptr<TableBuilder> table_builder(
      block_based_tf.NewTableBuilder(tb_options, dest_writer.get()));
  std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
      read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    table_builder->Add(iter->key(), iter->value());
  }
  s = iter->status();
  if (!s.ok()) {
    return s;
  }
  s = table_builder->Finish();
  if (!s.ok()) {
    return s;
  }
  *compressed_table_size = table_builder->FileSize();
  assert(num_data_blocks != nullptr);
  *num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
  return env->DeleteFile(testFileName);
}

Status SstFileDumper::ShowAllCompressionSizes(
    size_t block_size,
    const std::vector<std::pair<CompressionType, const char*>>&
        compression_types,
    int32_t compress_level_from, int32_t compress_level_to,
    uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
    uint64_t max_dict_buffer_bytes) {
fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
|
|
|
|
for (auto& i : compression_types) {
|
|
|
|
if (CompressionTypeSupported(i.first)) {
|
|
|
|
fprintf(stdout, "Compression: %-24s\n", i.second);
|
|
|
|
CompressionOptions compress_opt;
|
|
|
|
compress_opt.max_dict_bytes = max_dict_bytes;
|
|
|
|
compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
|
      compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes;
      for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
        fprintf(stdout, "Compression level: %d", j);
        compress_opt.level = j;
        Status s = ShowCompressionSize(block_size, i.first, compress_opt);
        if (!s.ok()) {
          return s;
        }
      }
    } else {
      fprintf(stdout, "Unsupported compression type: %s.\n", i.second);
    }
  }
  return Status::OK();
}
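
// Builds a copy of the table with the given compression settings and prints
// its size, build time, and per-block compression counters.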
Status SstFileDumper::ShowCompressionSize(
    size_t block_size, CompressionType compress_type,
    const CompressionOptions& compress_opt) {
  Options opts;
  opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  opts.statistics->set_stats_level(StatsLevel::kAll);
  const ImmutableOptions imoptions(opts);
  const ColumnFamilyOptions cfo(opts);
  const MutableCFOptions moptions(cfo);
  ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
  IntTblPropCollectorFactories block_based_table_factories;

  std::string column_family_name;
  int unknown_level = -1;
  TableBuilderOptions tb_opts(
      imoptions, moptions, ikc, &block_based_table_factories, compress_type,
      compress_opt,
      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
      column_family_name, unknown_level);
  uint64_t num_data_blocks = 0;
  std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();
  uint64_t file_size;
  Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks,
                                          &file_size);
  if (!s.ok()) {
    return s;
  }

  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
  fprintf(stdout, " Size: %10" PRIu64, file_size);
  fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
  fprintf(stdout, " Time Taken: %10s microsecs",
          std::to_string(
              std::chrono::duration_cast<std::chrono::microseconds>(end - start)
                  .count())
              .c_str());
  const uint64_t compressed_blocks =
      opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
  const uint64_t not_compressed_blocks =
      opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED);
  // When the option enable_index_compression is true,
  // NUMBER_BLOCK_COMPRESSED is incremented for index block(s).
  if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) {
    num_data_blocks = compressed_blocks + not_compressed_blocks;
  }
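
  // Blocks fall into three buckets: compressed, not compressed because the
  // compressed result did not achieve the required ratio, and not compressed
  // because compression was aborted. Only the first and last are counted by
  // the statistics tickers, so the "ratio" bucket is derived by subtraction.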
  const uint64_t ratio_not_compressed_blocks =
      (num_data_blocks - compressed_blocks) - not_compressed_blocks;
  const double compressed_pcnt =
      (0 == num_data_blocks) ? 0.0
                             : ((static_cast<double>(compressed_blocks) /
                                 static_cast<double>(num_data_blocks)) *
                                100.0);
  const double ratio_not_compressed_pcnt =
      (0 == num_data_blocks)
          ? 0.0
          : ((static_cast<double>(ratio_not_compressed_blocks) /
              static_cast<double>(num_data_blocks)) *
             100.0);
  const double not_compressed_pcnt =
      (0 == num_data_blocks) ? 0.0
                             : ((static_cast<double>(not_compressed_blocks) /
                                 static_cast<double>(num_data_blocks)) *
                                100.0);
  fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
          compressed_pcnt);
  fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
          ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
  fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
          not_compressed_blocks, not_compressed_pcnt);
  return Status::OK();
}
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
3 years ago

// Reads TableProperties prior to opening the table reader in order to set up
// options.
Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
                                          RandomAccessFileReader* file,
                                          uint64_t file_size,
                                          FilePrefetchBuffer* prefetch_buffer) {
  Status s = ROCKSDB_NAMESPACE::ReadTableProperties(
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
3 years ago
      file, file_size, table_magic_number, ioptions_, &table_properties_,
      /* memory_allocator= */ nullptr, prefetch_buffer);
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
3 years ago
  if (!s.ok()) {
    if (!silent_) {
      fprintf(stdout, "Not able to read table properties\n");
    }
  }
  return s;
}
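
// Picks a table factory (block-based or plain) matching the on-disk format
// identified by the footer's magic number so that a compatible TableReader
// can be constructed.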
Status SstFileDumper::SetTableOptionsByMagicNumber(
    uint64_t table_magic_number) {
  assert(table_properties_);
  if (table_magic_number == kBlockBasedTableMagicNumber ||
      table_magic_number == kLegacyBlockBasedTableMagicNumber) {
    BlockBasedTableFactory* bbtf = new BlockBasedTableFactory();
    // To force tail prefetching, fake-report two useful 512KB reads from the
    // tail; the stats need at least two data points to warm up.
    bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
    bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);

    options_.table_factory.reset(bbtf);
    if (!silent_) {
      fprintf(stdout, "Sst file format: block-based\n");
    }

    auto& props = table_properties_->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
      if (index_type_on_file ==
          BlockBasedTableOptions::IndexType::kHashSearch) {
        options_.prefix_extractor.reset(NewNoopTransform());
      }
    }
  } else if (table_magic_number == kPlainTableMagicNumber ||
             table_magic_number == kLegacyPlainTableMagicNumber) {
    options_.allow_mmap_reads = true;

    PlainTableOptions plain_table_options;
    plain_table_options.user_key_len = kPlainTableVariableLength;
    plain_table_options.bloom_bits_per_key = 0;
    plain_table_options.hash_table_ratio = 0;
    plain_table_options.index_sparseness = 1;
    plain_table_options.huge_page_tlb_size = 0;
    plain_table_options.encoding_type = kPlain;
    plain_table_options.full_scan_mode = true;

    options_.table_factory.reset(NewPlainTableFactory(plain_table_options));
    if (!silent_) {
      fprintf(stdout, "Sst file format: plain table\n");
    }
  } else {
    char error_msg_buffer[80];
    snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
             "Unsupported table magic number --- %" PRIx64,
             table_magic_number);
    return Status::InvalidArgument(error_msg_buffer);
  }

  return Status::OK();
}

Status SstFileDumper::SetOldTableOptions() {
  assert(table_properties_ == nullptr);
  options_.table_factory = std::make_shared<BlockBasedTableFactory>();
  if (!silent_) {
    fprintf(stdout, "Sst file format: block-based(old version)\n");
  }

  return Status::OK();
}
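
// Scans the file in key order, optionally bounded below by from_key and
// above by to_key, printing up to read_num entries when print_kv is set.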
Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num,
                                     bool has_from, const std::string& from_key,
                                     bool has_to, const std::string& to_key,
                                     bool use_from_as_prefix) {
  if (!table_reader_) {
    return init_result_;
  }

  InternalIterator* iter = table_reader_->NewIterator(
      read_options_, moptions_.prefix_extractor.get(),
      /*arena=*/nullptr, /*skip_filters=*/false,
      TableReaderCaller::kSSTDumpTool);
  uint64_t i = 0;
  if (has_from) {
    InternalKey ikey;
    ikey.SetMinPossibleForUserKey(from_key);
    iter->Seek(ikey.Encode());
  } else {
    iter->SeekToFirst();
  }
  for (; iter->Valid(); iter->Next()) {
    Slice key = iter->key();
    Slice value = iter->value();
    ++i;
    if (read_num > 0 && i > read_num) break;

    ParsedInternalKey ikey;
    Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */);
    if (!pik_status.ok()) {
      std::cerr << pik_status.getState() << "\n";
      continue;
    }

    // Stop if the key returned is no longer prefixed with our 'from' key.
    if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) {
      break;
    }

    // If an end marker was specified, stop before it.
    if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
      break;
    }

    if (print_kv) {
      if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) {
        fprintf(stdout, "%s => %s\n",
                ikey.DebugString(true, output_hex_).c_str(),
                value.ToString(output_hex_).c_str());
      } else {
        BlobIndex blob_index;

        const Status s = blob_index.DecodeFrom(value);
        if (!s.ok()) {
          fprintf(stderr, "%s => error decoding blob index\n",
                  ikey.DebugString(true, output_hex_).c_str());
          continue;
        }

        fprintf(stdout, "%s => %s\n",
                ikey.DebugString(true, output_hex_).c_str(),
                blob_index.DebugString(output_hex_).c_str());
      }
    }
  }

  read_num_ += i;

  Status ret = iter->status();
  delete iter;
  return ret;
}
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
3 years ago

// Provides TableProperties to the API user.
Status SstFileDumper::ReadTableProperties(
    std::shared_ptr<const TableProperties>* table_properties) {
  if (!table_reader_) {
    return init_result_;
  }

  *table_properties = table_reader_->GetTableProperties();
  return init_result_;
}
}  // namespace ROCKSDB_NAMESPACE

#endif  // ROCKSDB_LITE