|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
#include <string>
|
|
|
|
|
|
|
|
#include "cache/cache_reservation_manager.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/flush_block_policy.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Forward declarations: they let this header name these types without
// including the headers that define them.
struct ColumnFamilyOptions;
|
|
|
|
struct ConfigOptions;
|
|
|
|
struct DBOptions;
|
|
|
|
struct EnvOptions;
|
|
|
|
|
|
|
|
class BlockBasedTableBuilder;
|
|
|
|
class RandomAccessFileReader;
|
|
|
|
class WritableFileWriter;
|
|
|
|
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of an SST file (i.e., the blocks after the data blocks through the end of the file) during each SST file open, in the hope of prefetching everything needed for later reads (e.g., footer, meta index, filter/index) at once ahead of time. The existing approach to estimating the tail size to prefetch is the `TailPrefetchStats` heuristic introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky cases (e.g., a small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decided to record the exact tail size and use it directly to prefetch the tail of the SST file instead of relying on heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make `tail_start_offset` part of the table properties and use it to derive the tail size to record in the manifest for external files (e.g., file ingestion, import CF) and DB repair (which has no access to the manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use separate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2 years ago
|
|
|
// TODO: deprecate this class as it can be replaced with
|
|
|
|
// `FileMetaData::tail_size`
|
|
|
|
//
|
|
|
|
// A class used to track actual bytes written from the tail in the recent SST
|
|
|
|
// file opens, and provide a suggestion for following open.
|
|
|
|
class TailPrefetchStats {
|
|
|
|
public:
|
|
|
|
void RecordEffectiveSize(size_t len);
|
|
|
|
// 0 indicates no information to determine.
|
|
|
|
size_t GetSuggestedPrefetchSize();
|
|
|
|
|
|
|
|
private:
|
|
|
|
const static size_t kNumTracked = 32;
|
|
|
|
size_t records_[kNumTracked];
|
|
|
|
port::Mutex mutex_;
|
|
|
|
size_t next_ = 0;
|
|
|
|
size_t num_records_ = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
class BlockBasedTableFactory : public TableFactory {
|
|
|
|
public:
|
|
|
|
explicit BlockBasedTableFactory(
|
|
|
|
const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
|
|
|
|
|
|
|
|
~BlockBasedTableFactory() {}
|
|
|
|
|
|
|
|
// Method to allow CheckedCast to work for this class
|
|
|
|
static const char* kClassName() { return kBlockBasedTableName(); }
|
|
|
|
|
|
|
|
const char* Name() const override { return kBlockBasedTableName(); }
|
|
|
|
|
|
|
|
using TableFactory::NewTableReader;
|
|
|
|
Status NewTableReader(
|
|
|
|
const ReadOptions& ro, const TableReaderOptions& table_reader_options,
|
|
|
|
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
|
|
|
|
std::unique_ptr<TableReader>* table_reader,
|
|
|
|
bool prefetch_index_and_filter_in_cache = true) const override;
|
|
|
|
|
|
|
|
TableBuilder* NewTableBuilder(
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
const TableBuilderOptions& table_builder_options,
|
|
|
|
WritableFileWriter* file) const override;
|
|
|
|
|
|
|
|
// Valdates the specified DB Options.
|
|
|
|
Status ValidateOptions(const DBOptions& db_opts,
|
|
|
|
const ColumnFamilyOptions& cf_opts) const override;
|
|
|
|
Status PrepareOptions(const ConfigOptions& opts) override;
|
|
|
|
|
|
|
|
std::string GetPrintableOptions() const override;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
9 years ago
|
|
|
|
|
|
|
bool IsDeleteRangeSupported() const override { return true; }
|
|
|
|
|
|
|
|
TailPrefetchStats* tail_prefetch_stats() { return &tail_prefetch_stats_; }
|
|
|
|
|
|
|
|
protected:
|
|
|
|
const void* GetOptionsPtr(const std::string& name) const override;
|
|
|
|
Status ParseOption(const ConfigOptions& config_options,
|
|
|
|
const OptionTypeInfo& opt_info,
|
|
|
|
const std::string& opt_name, const std::string& opt_value,
|
|
|
|
void* opt_ptr) override;
|
|
|
|
void InitializeOptions();
|
|
|
|
|
|
|
|
private:
|
|
|
|
BlockBasedTableOptions table_options_;
|
|
|
|
std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr_;
|
|
|
|
mutable TailPrefetchStats tail_prefetch_stats_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// String constants declared here; their definitions live in another
// translation unit.
// NOTE(review): presumably block names and property values used by the
// block-based table format -- confirm against the definitions.
extern const std::string kHashIndexPrefixesBlock;
|
|
|
|
extern const std::string kHashIndexPrefixesMetadataBlock;
|
|
|
|
extern const std::string kPropTrue;
|
|
|
|
extern const std::string kPropFalse;
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|