|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#include "db/db_impl/db_impl_readonly.h"
|
|
|
|
#include "db/arena_wrapped_db_iter.h"
|
|
|
|
|
|
|
|
#include "db/compacted_db_impl.h"
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
|
|
#include "db/db_iter.h"
|
|
|
|
#include "db/merge_context.h"
|
|
|
|
#include "monitoring/perf_context_imp.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
|
|
|
|
const std::string& dbname)
|
|
|
|
: DBImpl(db_options, dbname) {
|
|
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
|
|
"Opening the db in read only mode");
|
|
|
|
LogFlush(immutable_db_options_.info_log);
|
|
|
|
}
|
|
|
|
|
|
|
|
DBImplReadOnly::~DBImplReadOnly() {}
|
|
|
|
|
|
|
|
// Implementations of the DB interface
|
|
|
|
Status DBImplReadOnly::Get(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableSlice* pinnable_val) {
|
|
|
|
assert(pinnable_val != nullptr);
|
|
|
|
// TODO: stopwatch DB_GET needed?, perf timer needed?
|
|
|
|
PERF_TIMER_GUARD(get_snapshot_time);
|
|
|
|
Status s;
|
|
|
|
SequenceNumber snapshot = versions_->LastSequence();
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
if (tracer_) {
|
|
|
|
InstrumentedMutexLock lock(&trace_mutex_);
|
|
|
|
if (tracer_) {
|
|
|
|
tracer_->Get(column_family, key);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
SuperVersion* super_version = cfd->GetSuperVersion();
|
|
|
|
MergeContext merge_context;
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
6 years ago
|
|
|
SequenceNumber max_covering_tombstone_seq = 0;
|
|
|
|
LookupKey lkey(key, snapshot);
|
|
|
|
PERF_TIMER_STOP(get_snapshot_time);
|
|
|
|
if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
|
|
|
|
/*timestamp=*/nullptr, &s, &merge_context,
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
6 years ago
|
|
|
&max_covering_tombstone_seq, read_options)) {
|
|
|
|
pinnable_val->PinSelf();
|
|
|
|
RecordTick(stats_, MEMTABLE_HIT);
|
|
|
|
} else {
|
|
|
|
PERF_TIMER_GUARD(get_from_output_files_time);
|
|
|
|
super_version->current->Get(read_options, lkey, pinnable_val,
|
|
|
|
/*timestamp=*/nullptr, &s, &merge_context,
|
|
|
|
&max_covering_tombstone_seq);
|
|
|
|
RecordTick(stats_, MEMTABLE_MISS);
|
|
|
|
}
|
|
|
|
RecordTick(stats_, NUMBER_KEYS_READ);
|
|
|
|
size_t size = pinnable_val->size();
|
|
|
|
RecordTick(stats_, BYTES_READ, size);
|
|
|
|
RecordInHistogram(stats_, BYTES_PER_READ, size);
|
|
|
|
PERF_COUNTER_ADD(get_read_bytes, size);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
|
|
|
|
SequenceNumber latest_snapshot = versions_->LastSequence();
|
|
|
|
SequenceNumber read_seq =
|
|
|
|
read_options.snapshot != nullptr
|
|
|
|
? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
|
|
|
|
->number_
|
|
|
|
: latest_snapshot;
|
|
|
|
ReadCallback* read_callback = nullptr; // No read callback provided.
|
|
|
|
auto db_iter = NewArenaWrappedDbIterator(
|
|
|
|
env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
|
|
|
|
read_seq,
|
|
|
|
super_version->mutable_cf_options.max_sequential_skip_in_iterations,
|
|
|
|
super_version->version_number, read_callback);
|
|
|
|
auto internal_iter =
|
|
|
|
NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
db_iter->GetRangeDelAggregator(), read_seq,
|
|
|
|
/* allow_unprepared_value */ true);
|
|
|
|
db_iter->SetIterUnderDBIter(internal_iter);
|
|
|
|
return db_iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImplReadOnly::NewIterators(
|
|
|
|
const ReadOptions& read_options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
|
|
|
std::vector<Iterator*>* iterators) {
|
|
|
|
ReadCallback* read_callback = nullptr; // No read callback provided.
|
|
|
|
if (iterators == nullptr) {
|
|
|
|
return Status::InvalidArgument("iterators not allowed to be nullptr");
|
|
|
|
}
|
|
|
|
iterators->clear();
|
|
|
|
iterators->reserve(column_families.size());
|
|
|
|
SequenceNumber latest_snapshot = versions_->LastSequence();
|
|
|
|
SequenceNumber read_seq =
|
|
|
|
read_options.snapshot != nullptr
|
|
|
|
? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
|
|
|
|
->number_
|
|
|
|
: latest_snapshot;
|
|
|
|
|
|
|
|
for (auto cfh : column_families) {
|
|
|
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
|
|
|
|
auto* sv = cfd->GetSuperVersion()->Ref();
|
|
|
|
auto* db_iter = NewArenaWrappedDbIterator(
|
|
|
|
env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq,
|
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
|
|
|
sv->version_number, read_callback);
|
|
|
|
auto* internal_iter =
|
|
|
|
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
db_iter->GetRangeDelAggregator(), read_seq,
|
|
|
|
/* allow_unprepared_value */ true);
|
|
|
|
db_iter->SetIterUnderDBIter(internal_iter);
|
|
|
|
iterators->push_back(db_iter);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
// Return OK if dbname exists in the file system
|
|
|
|
// or create_if_missing is false
|
|
|
|
Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
|
|
|
|
const std::string& dbname) {
|
|
|
|
Status s;
|
|
|
|
if (!db_options.create_if_missing) {
|
|
|
|
// Attempt to read "CURRENT" file
|
|
|
|
const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
|
|
|
|
std::string manifest_path;
|
|
|
|
uint64_t manifest_file_number;
|
|
|
|
s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
|
|
|
|
&manifest_file_number);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return Status::NotFound(CurrentFileName(dbname), "does not exist");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
|
|
|
|
DB** dbptr, bool /*error_if_log_file_exist*/) {
|
|
|
|
// If dbname does not exist in the file system, should not do anything
|
|
|
|
Status s = OpenForReadOnlyCheckExistence(options, dbname);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
*dbptr = nullptr;
|
|
|
|
|
|
|
|
// Try to first open DB as fully compacted DB
|
|
|
|
s = CompactedDBImpl::Open(options, dbname, dbptr);
|
|
|
|
if (s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
DBOptions db_options(options);
|
|
|
|
ColumnFamilyOptions cf_options(options);
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
|
|
|
|
s = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
|
|
|
|
db_options, dbname, column_families, &handles, dbptr);
|
|
|
|
if (s.ok()) {
|
|
|
|
assert(handles.size() == 1);
|
|
|
|
// i can delete the handle since DBImpl is always holding a
|
|
|
|
// reference to default column family
|
|
|
|
delete handles[0];
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::OpenForReadOnly(
|
|
|
|
const DBOptions& db_options, const std::string& dbname,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
|
|
|
|
bool error_if_log_file_exist) {
|
|
|
|
// If dbname does not exist in the file system, should not do anything
|
|
|
|
Status s = OpenForReadOnlyCheckExistence(db_options, dbname);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
return DBImplReadOnly::OpenForReadOnlyWithoutCheck(
|
|
|
|
db_options, dbname, column_families, handles, dbptr,
|
|
|
|
error_if_log_file_exist);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImplReadOnly::OpenForReadOnlyWithoutCheck(
|
|
|
|
const DBOptions& db_options, const std::string& dbname,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
|
|
|
|
bool error_if_log_file_exist) {
|
|
|
|
*dbptr = nullptr;
|
|
|
|
handles->clear();
|
|
|
|
|
|
|
|
SuperVersionContext sv_context(/* create_superversion */ true);
|
|
|
|
DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
|
|
|
|
impl->mutex_.Lock();
|
|
|
|
Status s = impl->Recover(column_families, true /* read only */,
|
|
|
|
error_if_log_file_exist);
|
|
|
|
if (s.ok()) {
|
|
|
|
// set column family handles
|
|
|
|
for (auto cf : column_families) {
|
|
|
|
auto cfd =
|
|
|
|
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
|
|
|
|
if (cfd == nullptr) {
|
|
|
|
s = Status::InvalidArgument("Column family not found: ", cf.name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
|
|
|
sv_context.NewSuperVersion();
|
|
|
|
cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
impl->mutex_.Unlock();
|
|
|
|
sv_context.Clean();
|
|
|
|
if (s.ok()) {
|
|
|
|
*dbptr = impl;
|
|
|
|
for (auto* h : *handles) {
|
|
|
|
impl->NewThreadStatusCfInfo(
|
|
|
|
reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (auto h : *handles) {
|
|
|
|
delete h;
|
|
|
|
}
|
|
|
|
handles->clear();
|
|
|
|
delete impl;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
#else // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
Status DB::OpenForReadOnly(const Options& /*options*/,
|
|
|
|
const std::string& /*dbname*/, DB** /*dbptr*/,
|
|
|
|
bool /*error_if_log_file_exist*/) {
|
|
|
|
return Status::NotSupported("Not supported in ROCKSDB_LITE.");
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::OpenForReadOnly(
|
|
|
|
const DBOptions& /*db_options*/, const std::string& /*dbname*/,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
|
|
|
|
std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
|
|
|
|
bool /*error_if_log_file_exist*/) {
|
|
|
|
return Status::NotSupported("Not supported in ROCKSDB_LITE.");
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|