|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#include "table/block_fetcher.h"
|
|
|
|
|
|
|
|
#include "db/table_properties_collector.h"
|
|
|
|
#include "file/file_util.h"
|
|
|
|
#include "options/options_helper.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "port/stack_trace.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/file_system.h"
|
|
|
|
#include "table/block_based/binary_search_index_reader.h"
|
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/format.h"
|
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "utilities/memory_allocators.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace {
|
|
|
|
// Number of memcpy calls a BlockFetcher reported, split by the kind of
// destination buffer. Counters default to zero so that a stats object that
// was never filled in (e.g. because a helper bailed out on a fatal assertion)
// still holds well-defined values.
struct MemcpyStats {
  int num_stack_buf_memcpy = 0;
  int num_heap_buf_memcpy = 0;
  int num_compressed_buf_memcpy = 0;
};

// Number of buffer allocations served by the heap-buffer allocator and by the
// compressed-buffer allocator. Counters default to zero.
struct BufAllocationStats {
  int num_heap_buf_allocations = 0;
  int num_compressed_buf_allocations = 0;
};

// Expected memcpy and allocation counters for one read mode. Still an
// aggregate, so `{{a, b, c}, {d, e}}` initialization keeps working.
struct TestStats {
  MemcpyStats memcpy_stats;
  BufAllocationStats buf_allocation_stats;
};
|
|
|
|
|
|
|
|
class BlockFetcherTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
enum class Mode {
|
|
|
|
kBufferedRead = 0,
|
|
|
|
kBufferedMmap,
|
|
|
|
kDirectRead,
|
|
|
|
kNumModes,
|
|
|
|
};
|
|
|
|
// use NumModes as array size to avoid "size of array '...' has non-integral
|
|
|
|
// type" errors.
|
|
|
|
const static int NumModes = static_cast<int>(Mode::kNumModes);
|
|
|
|
|
|
|
|
protected:
|
|
|
|
void SetUp() override {
|
|
|
|
SetupSyncPointsToMockDirectIO();
|
|
|
|
test_dir_ = test::PerThreadDBPath("block_fetcher_test");
|
|
|
|
env_ = Env::Default();
|
|
|
|
fs_ = FileSystem::Default();
|
|
|
|
ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
|
|
|
|
}
|
|
|
|
|
|
|
|
void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
|
|
|
|
|
|
|
|
void AssertSameBlock(const std::string& block1, const std::string& block2) {
|
|
|
|
ASSERT_EQ(block1, block2);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Creates a table with kv pairs (i, i) where i ranges from 0 to 9, inclusive.
|
|
|
|
void CreateTable(const std::string& table_name,
|
|
|
|
const CompressionType& compression_type) {
|
|
|
|
std::unique_ptr<WritableFileWriter> writer;
|
|
|
|
NewFileWriter(table_name, &writer);
|
|
|
|
|
|
|
|
// Create table builder.
|
|
|
|
ImmutableOptions ioptions(options_);
|
|
|
|
InternalKeyComparator comparator(options_.comparator);
|
|
|
|
ColumnFamilyOptions cf_options(options_);
|
|
|
|
MutableCFOptions moptions(cf_options);
|
|
|
|
IntTblPropCollectorFactories factories;
|
|
|
|
std::unique_ptr<TableBuilder> table_builder(table_factory_.NewTableBuilder(
|
|
|
|
TableBuilderOptions(ioptions, moptions, comparator, &factories,
|
|
|
|
compression_type, CompressionOptions(),
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
4 years ago
|
|
|
0 /* column_family_id */, kDefaultColumnFamilyName,
|
|
|
|
-1 /* level */),
|
|
|
|
writer.get()));
|
|
|
|
|
|
|
|
// Build table.
|
|
|
|
for (int i = 0; i < 9; i++) {
|
|
|
|
std::string key = ToInternalKey(std::to_string(i));
|
Fix testcase failures on windows (#7992)
Summary:
Fixed 5 test case failures found on Windows 10/Windows Server 2016
1. In `flush_job_test`, the DestroyDir function fails in deconstructor because some file handles are still being held by VersionSet. This happens on Windows Server 2016, so need to manually reset versions_ pointer to release all file handles.
2. In `StatsHistoryTest.InMemoryStatsHistoryPurging` test, the capping memory cost of stats_history_size on Windows becomes 14000 bytes with latest changes, not just 13000 bytes.
3. In `SSTDumpToolTest.RawOutput` test, the output file handle is not closed at the end.
4. In `FullBloomTest.OptimizeForMemory` test, ROCKSDB_MALLOC_USABLE_SIZE is undefined on windows so `total_mem` is always equal to `total_size`. The internal memory fragmentation assertion does not apply in this case.
5. In `BlockFetcherTest.FetchAndUncompressCompressedDataBlock` test, XPRESS cannot reach 87.5% compression ratio with original CreateTable method, so I append extra zeros to the string value to enhance compression ratio. Beside, since XPRESS allocates memory internally, thus does not support for custom allocator verification, we will skip the allocator verification for XPRESS
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7992
Reviewed By: jay-zhuang
Differential Revision: D26615283
Pulled By: ajkr
fbshipit-source-id: 3632612f84b99e2b9c77c403b112b6bedf3b125d
4 years ago
|
|
|
// Append "00000000" to string value to enhance compression ratio
|
|
|
|
std::string value = "00000000" + std::to_string(i);
|
|
|
|
table_builder->Add(key, value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(table_builder->Finish());
|
|
|
|
}
|
|
|
|
|
|
|
|
void FetchIndexBlock(const std::string& table_name,
|
|
|
|
CountedMemoryAllocator* heap_buf_allocator,
|
|
|
|
CountedMemoryAllocator* compressed_buf_allocator,
|
|
|
|
MemcpyStats* memcpy_stats, BlockContents* index_block,
|
|
|
|
std::string* result) {
|
|
|
|
FileOptions fopt(options_);
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file;
|
|
|
|
NewFileReader(table_name, fopt, &file);
|
|
|
|
|
|
|
|
// Get handle of the index block.
|
|
|
|
Footer footer;
|
|
|
|
ReadFooter(file.get(), &footer);
|
|
|
|
const BlockHandle& index_handle = footer.index_handle();
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2 years ago
|
|
|
// FIXME: index handle will need to come from metaindex for
|
|
|
|
// format_version >= 6 when that becomes the default
|
|
|
|
ASSERT_FALSE(index_handle.IsNull());
|
|
|
|
|
|
|
|
CompressionType compression_type;
|
|
|
|
FetchBlock(file.get(), index_handle, BlockType::kIndex,
|
|
|
|
false /* compressed */, false /* do_uncompress */,
|
|
|
|
heap_buf_allocator, compressed_buf_allocator, index_block,
|
|
|
|
memcpy_stats, &compression_type);
|
|
|
|
ASSERT_EQ(compression_type, CompressionType::kNoCompression);
|
|
|
|
result->assign(index_block->data.ToString());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fetches the first data block in both direct IO and non-direct IO mode.
|
|
|
|
//
|
|
|
|
// compressed: whether the data blocks are compressed;
|
|
|
|
// do_uncompress: whether the data blocks should be uncompressed on fetching.
|
|
|
|
// compression_type: the expected compression type.
|
|
|
|
//
|
|
|
|
// Expects:
|
|
|
|
// Block contents are the same.
|
|
|
|
// Bufferr allocation and memory copy statistics are expected.
|
|
|
|
void TestFetchDataBlock(
|
|
|
|
const std::string& table_name_prefix, bool compressed, bool do_uncompress,
|
|
|
|
std::array<TestStats, NumModes> expected_stats_by_mode) {
|
|
|
|
for (CompressionType compression_type : GetSupportedCompressions()) {
|
|
|
|
bool do_compress = compression_type != kNoCompression;
|
|
|
|
if (compressed != do_compress) continue;
|
|
|
|
std::string compression_type_str =
|
|
|
|
CompressionTypeToString(compression_type);
|
|
|
|
|
|
|
|
std::string table_name = table_name_prefix + compression_type_str;
|
|
|
|
CreateTable(table_name, compression_type);
|
|
|
|
|
|
|
|
CompressionType expected_compression_type_after_fetch =
|
|
|
|
(compressed && !do_uncompress) ? compression_type : kNoCompression;
|
|
|
|
|
|
|
|
BlockContents blocks[NumModes];
|
|
|
|
std::string block_datas[NumModes];
|
|
|
|
MemcpyStats memcpy_stats[NumModes];
|
|
|
|
CountedMemoryAllocator heap_buf_allocators[NumModes];
|
|
|
|
CountedMemoryAllocator compressed_buf_allocators[NumModes];
|
|
|
|
for (int i = 0; i < NumModes; ++i) {
|
|
|
|
SetMode(static_cast<Mode>(i));
|
|
|
|
FetchFirstDataBlock(table_name, compressed, do_uncompress,
|
|
|
|
expected_compression_type_after_fetch,
|
|
|
|
&heap_buf_allocators[i],
|
|
|
|
&compressed_buf_allocators[i], &blocks[i],
|
|
|
|
&block_datas[i], &memcpy_stats[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < NumModes - 1; ++i) {
|
|
|
|
AssertSameBlock(block_datas[i], block_datas[i + 1]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check memcpy and buffer allocation statistics.
|
|
|
|
for (int i = 0; i < NumModes; ++i) {
|
|
|
|
const TestStats& expected_stats = expected_stats_by_mode[i];
|
|
|
|
|
|
|
|
ASSERT_EQ(memcpy_stats[i].num_stack_buf_memcpy,
|
|
|
|
expected_stats.memcpy_stats.num_stack_buf_memcpy);
|
|
|
|
ASSERT_EQ(memcpy_stats[i].num_heap_buf_memcpy,
|
|
|
|
expected_stats.memcpy_stats.num_heap_buf_memcpy);
|
|
|
|
ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy,
|
|
|
|
expected_stats.memcpy_stats.num_compressed_buf_memcpy);
|
|
|
|
|
Fix testcase failures on windows (#7992)
Summary:
Fixed 5 test case failures found on Windows 10/Windows Server 2016
1. In `flush_job_test`, the DestroyDir function fails in deconstructor because some file handles are still being held by VersionSet. This happens on Windows Server 2016, so need to manually reset versions_ pointer to release all file handles.
2. In `StatsHistoryTest.InMemoryStatsHistoryPurging` test, the capping memory cost of stats_history_size on Windows becomes 14000 bytes with latest changes, not just 13000 bytes.
3. In `SSTDumpToolTest.RawOutput` test, the output file handle is not closed at the end.
4. In `FullBloomTest.OptimizeForMemory` test, ROCKSDB_MALLOC_USABLE_SIZE is undefined on windows so `total_mem` is always equal to `total_size`. The internal memory fragmentation assertion does not apply in this case.
5. In `BlockFetcherTest.FetchAndUncompressCompressedDataBlock` test, XPRESS cannot reach 87.5% compression ratio with original CreateTable method, so I append extra zeros to the string value to enhance compression ratio. Beside, since XPRESS allocates memory internally, thus does not support for custom allocator verification, we will skip the allocator verification for XPRESS
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7992
Reviewed By: jay-zhuang
Differential Revision: D26615283
Pulled By: ajkr
fbshipit-source-id: 3632612f84b99e2b9c77c403b112b6bedf3b125d
4 years ago
|
|
|
if (kXpressCompression == compression_type) {
|
|
|
|
// XPRESS allocates memory internally, thus does not support for
|
|
|
|
// custom allocator verification
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(
|
|
|
|
heap_buf_allocators[i].GetNumAllocations(),
|
|
|
|
expected_stats.buf_allocation_stats.num_heap_buf_allocations);
|
|
|
|
ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(),
|
|
|
|
expected_stats.buf_allocation_stats
|
|
|
|
.num_compressed_buf_allocations);
|
|
|
|
|
|
|
|
// The allocated buffers are not deallocated until
|
|
|
|
// the block content is deleted.
|
|
|
|
ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0);
|
|
|
|
ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0);
|
|
|
|
blocks[i].allocation.reset();
|
|
|
|
ASSERT_EQ(
|
|
|
|
heap_buf_allocators[i].GetNumDeallocations(),
|
|
|
|
expected_stats.buf_allocation_stats.num_heap_buf_allocations);
|
|
|
|
ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(),
|
|
|
|
expected_stats.buf_allocation_stats
|
|
|
|
.num_compressed_buf_allocations);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetMode(Mode mode) {
|
|
|
|
switch (mode) {
|
|
|
|
case Mode::kBufferedRead:
|
|
|
|
options_.use_direct_reads = false;
|
|
|
|
options_.allow_mmap_reads = false;
|
|
|
|
break;
|
|
|
|
case Mode::kBufferedMmap:
|
|
|
|
options_.use_direct_reads = false;
|
|
|
|
options_.allow_mmap_reads = true;
|
|
|
|
break;
|
|
|
|
case Mode::kDirectRead:
|
|
|
|
options_.use_direct_reads = true;
|
|
|
|
options_.allow_mmap_reads = false;
|
|
|
|
break;
|
|
|
|
case Mode::kNumModes:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::string test_dir_;
|
|
|
|
Env* env_;
|
|
|
|
std::shared_ptr<FileSystem> fs_;
|
|
|
|
BlockBasedTableFactory table_factory_;
|
|
|
|
Options options_;
|
|
|
|
|
|
|
|
std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
|
|
|
|
|
|
|
|
void WriteToFile(const std::string& content, const std::string& filename) {
|
|
|
|
std::unique_ptr<FSWritableFile> f;
|
|
|
|
ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr));
|
|
|
|
ASSERT_OK(f->Append(content, IOOptions(), nullptr));
|
|
|
|
ASSERT_OK(f->Close(IOOptions(), nullptr));
|
|
|
|
}
|
|
|
|
|
|
|
|
void NewFileWriter(const std::string& filename,
|
|
|
|
std::unique_ptr<WritableFileWriter>* writer) {
|
|
|
|
std::string path = Path(filename);
|
|
|
|
FileOptions file_options;
|
|
|
|
ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path,
|
|
|
|
file_options, writer, nullptr));
|
|
|
|
}
|
|
|
|
|
|
|
|
void NewFileReader(const std::string& filename, const FileOptions& opt,
|
|
|
|
std::unique_ptr<RandomAccessFileReader>* reader) {
|
|
|
|
std::string path = Path(filename);
|
|
|
|
std::unique_ptr<FSRandomAccessFile> f;
|
|
|
|
ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr));
|
|
|
|
reader->reset(new RandomAccessFileReader(std::move(f), path,
|
|
|
|
env_->GetSystemClock().get()));
|
|
|
|
}
|
|
|
|
|
|
|
|
void NewTableReader(const ImmutableOptions& ioptions,
|
|
|
|
const FileOptions& foptions,
|
|
|
|
const InternalKeyComparator& comparator,
|
|
|
|
const std::string& table_name,
|
|
|
|
std::unique_ptr<BlockBasedTable>* table) {
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file;
|
|
|
|
NewFileReader(table_name, foptions, &file);
|
|
|
|
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size));
|
|
|
|
|
|
|
|
std::unique_ptr<TableReader> table_reader;
|
|
|
|
ReadOptions ro;
|
|
|
|
const auto* table_options =
|
|
|
|
table_factory_.GetOptions<BlockBasedTableOptions>();
|
|
|
|
ASSERT_NE(table_options, nullptr);
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2 years ago
|
|
|
ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options,
|
|
|
|
comparator, std::move(file), file_size,
|
|
|
|
0 /* block_protection_bytes_per_key */,
|
|
|
|
&table_reader, 0 /* tail_size */));
|
|
|
|
|
|
|
|
table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release()));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ToInternalKey(const std::string& key) {
|
|
|
|
InternalKey internal_key(key, 0, ValueType::kTypeValue);
|
|
|
|
return internal_key.Encode().ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
void ReadFooter(RandomAccessFileReader* file, Footer* footer) {
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size));
|
|
|
|
IOOptions opts;
|
|
|
|
ASSERT_OK(ReadFooterFromFile(opts, file, *fs_,
|
|
|
|
nullptr /* prefetch_buffer */, file_size,
|
|
|
|
footer, kBlockBasedTableMagicNumber));
|
|
|
|
}
|
|
|
|
|
|
|
|
// NOTE: compression_type returns the compression type of the fetched block
|
|
|
|
// contents, so if the block is fetched and uncompressed, then it's
|
|
|
|
// kNoCompression.
|
|
|
|
void FetchBlock(RandomAccessFileReader* file, const BlockHandle& block,
|
|
|
|
BlockType block_type, bool compressed, bool do_uncompress,
|
|
|
|
MemoryAllocator* heap_buf_allocator,
|
|
|
|
MemoryAllocator* compressed_buf_allocator,
|
|
|
|
BlockContents* contents, MemcpyStats* stats,
|
|
|
|
CompressionType* compresstion_type) {
|
|
|
|
ImmutableOptions ioptions(options_);
|
|
|
|
ReadOptions roptions;
|
|
|
|
PersistentCacheOptions persistent_cache_options;
|
|
|
|
Footer footer;
|
|
|
|
ReadFooter(file, &footer);
|
|
|
|
std::unique_ptr<BlockFetcher> fetcher(new BlockFetcher(
|
|
|
|
file, nullptr /* prefetch_buffer */, footer, roptions, block, contents,
|
|
|
|
ioptions, do_uncompress, compressed, block_type,
|
|
|
|
UncompressionDict::GetEmptyDict(), persistent_cache_options,
|
|
|
|
heap_buf_allocator, compressed_buf_allocator));
|
|
|
|
|
|
|
|
ASSERT_OK(fetcher->ReadBlockContents());
|
|
|
|
|
|
|
|
stats->num_stack_buf_memcpy = fetcher->TEST_GetNumStackBufMemcpy();
|
|
|
|
stats->num_heap_buf_memcpy = fetcher->TEST_GetNumHeapBufMemcpy();
|
|
|
|
stats->num_compressed_buf_memcpy =
|
|
|
|
fetcher->TEST_GetNumCompressedBufMemcpy();
|
|
|
|
|
|
|
|
*compresstion_type = fetcher->get_compression_type();
|
|
|
|
}
|
|
|
|
|
|
|
|
// NOTE: expected_compression_type is the expected compression
|
|
|
|
// type of the fetched block content, if the block is uncompressed,
|
|
|
|
// then the expected compression type is kNoCompression.
|
|
|
|
void FetchFirstDataBlock(const std::string& table_name, bool compressed,
|
|
|
|
bool do_uncompress,
|
|
|
|
CompressionType expected_compression_type,
|
|
|
|
MemoryAllocator* heap_buf_allocator,
|
|
|
|
MemoryAllocator* compressed_buf_allocator,
|
|
|
|
BlockContents* block, std::string* result,
|
|
|
|
MemcpyStats* memcpy_stats) {
|
|
|
|
ImmutableOptions ioptions(options_);
|
|
|
|
InternalKeyComparator comparator(options_.comparator);
|
|
|
|
FileOptions foptions(options_);
|
|
|
|
|
|
|
|
// Get block handle for the first data block.
|
|
|
|
std::unique_ptr<BlockBasedTable> table;
|
|
|
|
NewTableReader(ioptions, foptions, comparator, table_name, &table);
|
|
|
|
|
|
|
|
std::unique_ptr<BlockBasedTable::IndexReader> index_reader;
|
|
|
|
ReadOptions ro;
|
|
|
|
ASSERT_OK(BinarySearchIndexReader::Create(
|
|
|
|
table.get(), ro, nullptr /* prefetch_buffer */, false /* use_cache */,
|
|
|
|
false /* prefetch */, false /* pin */, nullptr /* lookup_context */,
|
|
|
|
&index_reader));
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iter(
|
|
|
|
index_reader->NewIterator(
|
|
|
|
ReadOptions(), false /* disable_prefix_seek */, nullptr /* iter */,
|
|
|
|
nullptr /* get_context */, nullptr /* lookup_context */));
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
iter->SeekToFirst();
|
|
|
|
BlockHandle first_block_handle = iter->value().handle;
|
|
|
|
|
|
|
|
// Fetch first data block.
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file;
|
|
|
|
NewFileReader(table_name, foptions, &file);
|
|
|
|
CompressionType compression_type;
|
|
|
|
FetchBlock(file.get(), first_block_handle, BlockType::kData, compressed,
|
|
|
|
do_uncompress, heap_buf_allocator, compressed_buf_allocator,
|
|
|
|
block, memcpy_stats, &compression_type);
|
|
|
|
ASSERT_EQ(compression_type, expected_compression_type);
|
|
|
|
result->assign(block->data.ToString());
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Skip the following tests in lite mode since direct I/O is unsupported.
|
|
|
|
|
|
|
|
// Fetch index block under both direct IO and non-direct IO.
|
|
|
|
// Expects:
|
|
|
|
// the index block contents are the same for both read modes.
|
|
|
|
TEST_F(BlockFetcherTest, FetchIndexBlock) {
|
|
|
|
for (CompressionType compression : GetSupportedCompressions()) {
|
|
|
|
std::string table_name =
|
|
|
|
"FetchIndexBlock" + CompressionTypeToString(compression);
|
|
|
|
CreateTable(table_name, compression);
|
|
|
|
|
|
|
|
CountedMemoryAllocator allocator;
|
|
|
|
MemcpyStats memcpy_stats;
|
|
|
|
BlockContents indexes[NumModes];
|
|
|
|
std::string index_datas[NumModes];
|
|
|
|
for (int i = 0; i < NumModes; ++i) {
|
|
|
|
SetMode(static_cast<Mode>(i));
|
|
|
|
FetchIndexBlock(table_name, &allocator, &allocator, &memcpy_stats,
|
|
|
|
&indexes[i], &index_datas[i]);
|
|
|
|
}
|
|
|
|
for (int i = 0; i < NumModes - 1; ++i) {
|
|
|
|
AssertSameBlock(index_datas[i], index_datas[i + 1]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Data blocks are not compressed,
|
|
|
|
// fetch data block under direct IO, mmap IO,and non-direct IO.
|
|
|
|
// Expects:
|
|
|
|
// 1. in non-direct IO mode, allocate a heap buffer and memcpy the block
|
|
|
|
// into the buffer;
|
|
|
|
// 2. in direct IO mode, allocate a heap buffer and memcpy from the
|
|
|
|
// direct IO buffer to the heap buffer.
|
|
|
|
TEST_F(BlockFetcherTest, FetchUncompressedDataBlock) {
  // Start every expectation from all-zero counters and spell out only the
  // counters that are expected to be non-zero.

  // Buffered read and direct read: one heap buffer allocation and one memcpy
  // into that heap buffer.
  TestStats expected_non_mmap_stats{};
  expected_non_mmap_stats.memcpy_stats.num_heap_buf_memcpy = 1;
  expected_non_mmap_stats.buf_allocation_stats.num_heap_buf_allocations = 1;

  // Mmap read: no memcpy and no buffer allocation of any kind.
  TestStats expected_mmap_stats{};

  // Index the expectations by read mode explicitly.
  std::array<TestStats, NumModes> expected_stats_by_mode{};
  expected_stats_by_mode[static_cast<size_t>(Mode::kBufferedRead)] =
      expected_non_mmap_stats;
  expected_stats_by_mode[static_cast<size_t>(Mode::kBufferedMmap)] =
      expected_mmap_stats;
  expected_stats_by_mode[static_cast<size_t>(Mode::kDirectRead)] =
      expected_non_mmap_stats;

  // NOTE(review): the two flags are presumably (compressed, do_uncompress);
  // confirm against the TestFetchDataBlock declaration.
  TestFetchDataBlock("FetchUncompressedDataBlock", false, false,
                     expected_stats_by_mode);
}
|
|
|
|
|
|
|
|
// Data blocks are compressed,
|
|
|
|
// fetch data block under both direct IO and non-direct IO,
|
|
|
|
// but do not uncompress.
|
|
|
|
// Expects:
|
|
|
|
// 1. in non-direct IO mode, allocate a compressed buffer and memcpy the block
|
|
|
|
// into the buffer;
|
|
|
|
// 2. in direct IO mode, allocate a compressed buffer and memcpy from the
|
|
|
|
// direct IO buffer to the compressed buffer.
|
|
|
|
TEST_F(BlockFetcherTest, FetchCompressedDataBlock) {
  // Start every expectation from all-zero counters and spell out only the
  // counters that are expected to be non-zero.

  // Buffered read and direct read: one compressed buffer allocation and one
  // memcpy into that compressed buffer.
  TestStats expected_non_mmap_stats{};
  expected_non_mmap_stats.memcpy_stats.num_compressed_buf_memcpy = 1;
  expected_non_mmap_stats.buf_allocation_stats.num_compressed_buf_allocations =
      1;

  // Mmap read: no memcpy and no buffer allocation of any kind.
  TestStats expected_mmap_stats{};

  // Index the expectations by read mode explicitly.
  std::array<TestStats, NumModes> expected_stats_by_mode{};
  expected_stats_by_mode[static_cast<size_t>(Mode::kBufferedRead)] =
      expected_non_mmap_stats;
  expected_stats_by_mode[static_cast<size_t>(Mode::kBufferedMmap)] =
      expected_mmap_stats;
  expected_stats_by_mode[static_cast<size_t>(Mode::kDirectRead)] =
      expected_non_mmap_stats;

  // NOTE(review): the two flags are presumably (compressed, do_uncompress);
  // confirm against the TestFetchDataBlock declaration.
  TestFetchDataBlock("FetchCompressedDataBlock", true, false,
                     expected_stats_by_mode);
}
|
|
|
|
|
|
|
|
// Data blocks are compressed,
|
|
|
|
// fetch and uncompress data block under both direct IO and non-direct IO.
|
|
|
|
// Expects:
|
|
|
|
// 1. in non-direct IO mode, since the block is small, it's first memcpyed
|
|
|
|
// to the stack buffer, then a heap buffer is allocated and the block is
|
|
|
|
// uncompressed into the heap.
|
|
|
|
// 2. in direct IO mode, allocate a heap buffer, then directly uncompress
|
|
|
|
// and memcpy from the direct IO buffer to the heap buffer.
|
|
|
|
TEST_F(BlockFetcherTest, FetchAndUncompressCompressedDataBlock) {
  // Start every expectation from all-zero counters and spell out only the
  // counters that are expected to be non-zero.

  // Buffered read: one memcpy into the stack buffer, plus one heap buffer
  // allocation and one heap buffer memcpy.
  TestStats expected_buffered_read_stats{};
  expected_buffered_read_stats.memcpy_stats.num_stack_buf_memcpy = 1;
  expected_buffered_read_stats.memcpy_stats.num_heap_buf_memcpy = 1;
  expected_buffered_read_stats.buf_allocation_stats.num_heap_buf_allocations =
      1;

  // Mmap read: no stack buffer memcpy; one heap buffer allocation and one
  // heap buffer memcpy.
  TestStats expected_mmap_stats{};
  expected_mmap_stats.memcpy_stats.num_heap_buf_memcpy = 1;
  expected_mmap_stats.buf_allocation_stats.num_heap_buf_allocations = 1;

  // Direct read: same counters as the mmap case, kept as a separate value so
  // each mode's expectation can be adjusted independently.
  TestStats expected_direct_read_stats{};
  expected_direct_read_stats.memcpy_stats.num_heap_buf_memcpy = 1;
  expected_direct_read_stats.buf_allocation_stats.num_heap_buf_allocations = 1;

  // Index the expectations by read mode explicitly.
  std::array<TestStats, NumModes> expected_stats_by_mode{};
  expected_stats_by_mode[static_cast<size_t>(Mode::kBufferedRead)] =
      expected_buffered_read_stats;
  expected_stats_by_mode[static_cast<size_t>(Mode::kBufferedMmap)] =
      expected_mmap_stats;
  expected_stats_by_mode[static_cast<size_t>(Mode::kDirectRead)] =
      expected_direct_read_stats;

  // NOTE(review): the two flags are presumably (compressed, do_uncompress);
  // confirm against the TestFetchDataBlock declaration.
  TestFetchDataBlock("FetchAndUncompressCompressedDataBlock", true, true,
                     expected_stats_by_mode);
}
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|