fork of https://github.com/oxigraph/rocksdb and https://github.com/facebook/rocksdb for nextgraph and oxigraph
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
569 lines
18 KiB
569 lines
18 KiB
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
#pragma once
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include <limits>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "db/merge_context.h"
|
|
#include "memtable/skiplist.h"
|
|
#include "options/db_options.h"
|
|
#include "port/port.h"
|
|
#include "rocksdb/comparator.h"
|
|
#include "rocksdb/iterator.h"
|
|
#include "rocksdb/slice.h"
|
|
#include "rocksdb/status.h"
|
|
#include "rocksdb/utilities/write_batch_with_index.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class MergeContext;
|
|
struct Options;
|
|
|
|
// when direction == forward
|
|
// * current_at_base_ <=> base_iterator > delta_iterator
|
|
// when direction == backwards
|
|
// * current_at_base_ <=> base_iterator < delta_iterator
|
|
// always:
|
|
// * equal_keys_ <=> base_iterator == delta_iterator
|
|
class BaseDeltaIterator : public Iterator {
|
|
public:
|
|
BaseDeltaIterator(Iterator* base_iterator, WBWIIterator* delta_iterator,
|
|
const Comparator* comparator,
|
|
const ReadOptions* read_options = nullptr)
|
|
: forward_(true),
|
|
current_at_base_(true),
|
|
equal_keys_(false),
|
|
status_(Status::OK()),
|
|
base_iterator_(base_iterator),
|
|
delta_iterator_(delta_iterator),
|
|
comparator_(comparator),
|
|
iterate_upper_bound_(read_options ? read_options->iterate_upper_bound
|
|
: nullptr) {}
|
|
|
|
~BaseDeltaIterator() override {}
|
|
|
|
bool Valid() const override {
|
|
return status_.ok() ? (current_at_base_ ? BaseValid() : DeltaValid())
|
|
: false;
|
|
}
|
|
|
|
void SeekToFirst() override {
|
|
forward_ = true;
|
|
base_iterator_->SeekToFirst();
|
|
delta_iterator_->SeekToFirst();
|
|
UpdateCurrent();
|
|
}
|
|
|
|
void SeekToLast() override {
|
|
forward_ = false;
|
|
base_iterator_->SeekToLast();
|
|
delta_iterator_->SeekToLast();
|
|
UpdateCurrent();
|
|
}
|
|
|
|
void Seek(const Slice& k) override {
|
|
forward_ = true;
|
|
base_iterator_->Seek(k);
|
|
delta_iterator_->Seek(k);
|
|
UpdateCurrent();
|
|
}
|
|
|
|
void SeekForPrev(const Slice& k) override {
|
|
forward_ = false;
|
|
base_iterator_->SeekForPrev(k);
|
|
delta_iterator_->SeekForPrev(k);
|
|
UpdateCurrent();
|
|
}
|
|
|
|
void Next() override {
|
|
if (!Valid()) {
|
|
status_ = Status::NotSupported("Next() on invalid iterator");
|
|
return;
|
|
}
|
|
|
|
if (!forward_) {
|
|
// Need to change direction
|
|
// if our direction was backward and we're not equal, we have two states:
|
|
// * both iterators are valid: we're already in a good state (current
|
|
// shows to smaller)
|
|
// * only one iterator is valid: we need to advance that iterator
|
|
forward_ = true;
|
|
equal_keys_ = false;
|
|
if (!BaseValid()) {
|
|
assert(DeltaValid());
|
|
base_iterator_->SeekToFirst();
|
|
} else if (!DeltaValid()) {
|
|
delta_iterator_->SeekToFirst();
|
|
} else if (current_at_base_) {
|
|
// Change delta from larger than base to smaller
|
|
AdvanceDelta();
|
|
} else {
|
|
// Change base from larger than delta to smaller
|
|
AdvanceBase();
|
|
}
|
|
if (DeltaValid() && BaseValid()) {
|
|
if (comparator_->Equal(delta_iterator_->Entry().key,
|
|
base_iterator_->key())) {
|
|
equal_keys_ = true;
|
|
}
|
|
}
|
|
}
|
|
Advance();
|
|
}
|
|
|
|
void Prev() override {
|
|
if (!Valid()) {
|
|
status_ = Status::NotSupported("Prev() on invalid iterator");
|
|
return;
|
|
}
|
|
|
|
if (forward_) {
|
|
// Need to change direction
|
|
// if our direction was backward and we're not equal, we have two states:
|
|
// * both iterators are valid: we're already in a good state (current
|
|
// shows to smaller)
|
|
// * only one iterator is valid: we need to advance that iterator
|
|
forward_ = false;
|
|
equal_keys_ = false;
|
|
if (!BaseValid()) {
|
|
assert(DeltaValid());
|
|
base_iterator_->SeekToLast();
|
|
} else if (!DeltaValid()) {
|
|
delta_iterator_->SeekToLast();
|
|
} else if (current_at_base_) {
|
|
// Change delta from less advanced than base to more advanced
|
|
AdvanceDelta();
|
|
} else {
|
|
// Change base from less advanced than delta to more advanced
|
|
AdvanceBase();
|
|
}
|
|
if (DeltaValid() && BaseValid()) {
|
|
if (comparator_->Equal(delta_iterator_->Entry().key,
|
|
base_iterator_->key())) {
|
|
equal_keys_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
Advance();
|
|
}
|
|
|
|
Slice key() const override {
|
|
return current_at_base_ ? base_iterator_->key()
|
|
: delta_iterator_->Entry().key;
|
|
}
|
|
|
|
Slice value() const override {
|
|
return current_at_base_ ? base_iterator_->value()
|
|
: delta_iterator_->Entry().value;
|
|
}
|
|
|
|
Status status() const override {
|
|
if (!status_.ok()) {
|
|
return status_;
|
|
}
|
|
if (!base_iterator_->status().ok()) {
|
|
return base_iterator_->status();
|
|
}
|
|
return delta_iterator_->status();
|
|
}
|
|
|
|
void Invalidate(Status s) { status_ = s; }
|
|
|
|
private:
|
|
void AssertInvariants() {
|
|
#ifndef NDEBUG
|
|
bool not_ok = false;
|
|
if (!base_iterator_->status().ok()) {
|
|
assert(!base_iterator_->Valid());
|
|
not_ok = true;
|
|
}
|
|
if (!delta_iterator_->status().ok()) {
|
|
assert(!delta_iterator_->Valid());
|
|
not_ok = true;
|
|
}
|
|
if (not_ok) {
|
|
assert(!Valid());
|
|
assert(!status().ok());
|
|
return;
|
|
}
|
|
|
|
if (!Valid()) {
|
|
return;
|
|
}
|
|
if (!BaseValid()) {
|
|
assert(!current_at_base_ && delta_iterator_->Valid());
|
|
return;
|
|
}
|
|
if (!DeltaValid()) {
|
|
assert(current_at_base_ && base_iterator_->Valid());
|
|
return;
|
|
}
|
|
// we don't support those yet
|
|
assert(delta_iterator_->Entry().type != kMergeRecord &&
|
|
delta_iterator_->Entry().type != kLogDataRecord);
|
|
int compare = comparator_->Compare(delta_iterator_->Entry().key,
|
|
base_iterator_->key());
|
|
if (forward_) {
|
|
// current_at_base -> compare < 0
|
|
assert(!current_at_base_ || compare < 0);
|
|
// !current_at_base -> compare <= 0
|
|
assert(current_at_base_ && compare >= 0);
|
|
} else {
|
|
// current_at_base -> compare > 0
|
|
assert(!current_at_base_ || compare > 0);
|
|
// !current_at_base -> compare <= 0
|
|
assert(current_at_base_ && compare <= 0);
|
|
}
|
|
// equal_keys_ <=> compare == 0
|
|
assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0));
|
|
#endif
|
|
}
|
|
|
|
void Advance() {
|
|
if (equal_keys_) {
|
|
assert(BaseValid() && DeltaValid());
|
|
AdvanceBase();
|
|
AdvanceDelta();
|
|
} else {
|
|
if (current_at_base_) {
|
|
assert(BaseValid());
|
|
AdvanceBase();
|
|
} else {
|
|
assert(DeltaValid());
|
|
AdvanceDelta();
|
|
}
|
|
}
|
|
UpdateCurrent();
|
|
}
|
|
|
|
void AdvanceDelta() {
|
|
if (forward_) {
|
|
delta_iterator_->Next();
|
|
} else {
|
|
delta_iterator_->Prev();
|
|
}
|
|
}
|
|
void AdvanceBase() {
|
|
if (forward_) {
|
|
base_iterator_->Next();
|
|
} else {
|
|
base_iterator_->Prev();
|
|
}
|
|
}
|
|
bool BaseValid() const { return base_iterator_->Valid(); }
|
|
bool DeltaValid() const { return delta_iterator_->Valid(); }
|
|
void UpdateCurrent() {
|
|
// Suppress false positive clang analyzer warnings.
|
|
#ifndef __clang_analyzer__
|
|
status_ = Status::OK();
|
|
while (true) {
|
|
WriteEntry delta_entry;
|
|
if (DeltaValid()) {
|
|
assert(delta_iterator_->status().ok());
|
|
delta_entry = delta_iterator_->Entry();
|
|
} else if (!delta_iterator_->status().ok()) {
|
|
// Expose the error status and stop.
|
|
current_at_base_ = false;
|
|
return;
|
|
}
|
|
equal_keys_ = false;
|
|
if (!BaseValid()) {
|
|
if (!base_iterator_->status().ok()) {
|
|
// Expose the error status and stop.
|
|
current_at_base_ = true;
|
|
return;
|
|
}
|
|
|
|
// Base has finished.
|
|
if (!DeltaValid()) {
|
|
// Finished
|
|
return;
|
|
}
|
|
if (iterate_upper_bound_) {
|
|
if (comparator_->Compare(delta_entry.key, *iterate_upper_bound_) >=
|
|
0) {
|
|
// out of upper bound -> finished.
|
|
return;
|
|
}
|
|
}
|
|
if (delta_entry.type == kDeleteRecord ||
|
|
delta_entry.type == kSingleDeleteRecord) {
|
|
AdvanceDelta();
|
|
} else {
|
|
current_at_base_ = false;
|
|
return;
|
|
}
|
|
} else if (!DeltaValid()) {
|
|
// Delta has finished.
|
|
current_at_base_ = true;
|
|
return;
|
|
} else {
|
|
int compare =
|
|
(forward_ ? 1 : -1) *
|
|
comparator_->Compare(delta_entry.key, base_iterator_->key());
|
|
if (compare <= 0) { // delta bigger or equal
|
|
if (compare == 0) {
|
|
equal_keys_ = true;
|
|
}
|
|
if (delta_entry.type != kDeleteRecord &&
|
|
delta_entry.type != kSingleDeleteRecord) {
|
|
current_at_base_ = false;
|
|
return;
|
|
}
|
|
// Delta is less advanced and is delete.
|
|
AdvanceDelta();
|
|
if (equal_keys_) {
|
|
AdvanceBase();
|
|
}
|
|
} else {
|
|
current_at_base_ = true;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
AssertInvariants();
|
|
#endif // __clang_analyzer__
|
|
}
|
|
|
|
bool forward_;
|
|
bool current_at_base_;
|
|
bool equal_keys_;
|
|
Status status_;
|
|
std::unique_ptr<Iterator> base_iterator_;
|
|
std::unique_ptr<WBWIIterator> delta_iterator_;
|
|
const Comparator* comparator_; // not owned
|
|
const Slice* iterate_upper_bound_;
|
|
};
|
|
|
|
// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
|
|
struct WriteBatchIndexEntry {
|
|
WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
|
|
: offset(o),
|
|
column_family(c),
|
|
key_offset(ko),
|
|
key_size(ksz),
|
|
search_key(nullptr) {}
|
|
// Create a dummy entry as the search key. This index entry won't be backed
|
|
// by an entry from the write batch, but a pointer to the search key. Or a
|
|
// special flag of offset can indicate we are seek to first.
|
|
// @_search_key: the search key
|
|
// @_column_family: column family
|
|
// @is_forward_direction: true for Seek(). False for SeekForPrev()
|
|
// @is_seek_to_first: true if we seek to the beginning of the column family
|
|
// _search_key should be null in this case.
|
|
WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
|
|
bool is_forward_direction, bool is_seek_to_first)
|
|
// For SeekForPrev(), we need to make the dummy entry larger than any
|
|
// entry who has the same search key. Otherwise, we'll miss those entries.
|
|
: offset(is_forward_direction ? 0 : port::kMaxSizet),
|
|
column_family(_column_family),
|
|
key_offset(0),
|
|
key_size(is_seek_to_first ? kFlagMinInCf : 0),
|
|
search_key(_search_key) {
|
|
assert(_search_key != nullptr || is_seek_to_first);
|
|
}
|
|
|
|
// If this flag appears in the key_size, it indicates a
|
|
// key that is smaller than any other entry for the same column family.
|
|
static const size_t kFlagMinInCf = port::kMaxSizet;
|
|
|
|
bool is_min_in_cf() const {
|
|
assert(key_size != kFlagMinInCf ||
|
|
(key_offset == 0 && search_key == nullptr));
|
|
return key_size == kFlagMinInCf;
|
|
}
|
|
|
|
// offset of an entry in write batch's string buffer. If this is a dummy
|
|
// lookup key, in which case search_key != nullptr, offset is set to either
|
|
// 0 or max, only for comparison purpose. Because when entries have the same
|
|
// key, the entry with larger offset is larger, offset = 0 will make a seek
|
|
// key small or equal than all the entries with the seek key, so that Seek()
|
|
// will find all the entries of the same key. Similarly, offset = MAX will
|
|
// make the entry just larger than all entries with the search key so
|
|
// SeekForPrev() will see all the keys with the same key.
|
|
size_t offset;
|
|
uint32_t column_family; // c1olumn family of the entry.
|
|
size_t key_offset; // offset of the key in write batch's string buffer.
|
|
size_t key_size; // size of the key. kFlagMinInCf indicates
|
|
// that this is a dummy look up entry for
|
|
// SeekToFirst() to the beginning of the column
|
|
// family. We use the flag here to save a boolean
|
|
// in the struct.
|
|
|
|
const Slice* search_key; // if not null, instead of reading keys from
|
|
// write batch, use it to compare. This is used
|
|
// for lookup key.
|
|
};
|
|
|
|
class ReadableWriteBatch : public WriteBatch {
|
|
public:
|
|
explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
|
|
: WriteBatch(reserved_bytes, max_bytes) {}
|
|
// Retrieve some information from a write entry in the write batch, given
|
|
// the start offset of the write entry.
|
|
Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
|
|
Slice* value, Slice* blob, Slice* xid) const;
|
|
};
|
|
|
|
class WriteBatchEntryComparator {
|
|
public:
|
|
WriteBatchEntryComparator(const Comparator* _default_comparator,
|
|
const ReadableWriteBatch* write_batch)
|
|
: default_comparator_(_default_comparator), write_batch_(write_batch) {}
|
|
// Compare a and b. Return a negative value if a is less than b, 0 if they
|
|
// are equal, and a positive value if a is greater than b
|
|
int operator()(const WriteBatchIndexEntry* entry1,
|
|
const WriteBatchIndexEntry* entry2) const;
|
|
|
|
int CompareKey(uint32_t column_family, const Slice& key1,
|
|
const Slice& key2) const;
|
|
|
|
void SetComparatorForCF(uint32_t column_family_id,
|
|
const Comparator* comparator) {
|
|
if (column_family_id >= cf_comparators_.size()) {
|
|
cf_comparators_.resize(column_family_id + 1, nullptr);
|
|
}
|
|
cf_comparators_[column_family_id] = comparator;
|
|
}
|
|
|
|
const Comparator* default_comparator() { return default_comparator_; }
|
|
|
|
private:
|
|
const Comparator* default_comparator_;
|
|
std::vector<const Comparator*> cf_comparators_;
|
|
const ReadableWriteBatch* write_batch_;
|
|
};
|
|
|
|
typedef SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>
|
|
WriteBatchEntrySkipList;
|
|
|
|
class WBWIIteratorImpl : public WBWIIterator {
|
|
public:
|
|
WBWIIteratorImpl(uint32_t column_family_id,
|
|
WriteBatchEntrySkipList* skip_list,
|
|
const ReadableWriteBatch* write_batch,
|
|
WriteBatchEntryComparator* comparator)
|
|
: column_family_id_(column_family_id),
|
|
skip_list_iter_(skip_list),
|
|
write_batch_(write_batch),
|
|
comparator_(comparator) {}
|
|
|
|
~WBWIIteratorImpl() override {}
|
|
|
|
bool Valid() const override {
|
|
if (!skip_list_iter_.Valid()) {
|
|
return false;
|
|
}
|
|
const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
|
|
return (iter_entry != nullptr &&
|
|
iter_entry->column_family == column_family_id_);
|
|
}
|
|
|
|
void SeekToFirst() override {
|
|
WriteBatchIndexEntry search_entry(
|
|
nullptr /* search_key */, column_family_id_,
|
|
true /* is_forward_direction */, true /* is_seek_to_first */);
|
|
skip_list_iter_.Seek(&search_entry);
|
|
}
|
|
|
|
void SeekToLast() override {
|
|
WriteBatchIndexEntry search_entry(
|
|
nullptr /* search_key */, column_family_id_ + 1,
|
|
true /* is_forward_direction */, true /* is_seek_to_first */);
|
|
skip_list_iter_.Seek(&search_entry);
|
|
if (!skip_list_iter_.Valid()) {
|
|
skip_list_iter_.SeekToLast();
|
|
} else {
|
|
skip_list_iter_.Prev();
|
|
}
|
|
}
|
|
|
|
void Seek(const Slice& key) override {
|
|
WriteBatchIndexEntry search_entry(&key, column_family_id_,
|
|
true /* is_forward_direction */,
|
|
false /* is_seek_to_first */);
|
|
skip_list_iter_.Seek(&search_entry);
|
|
}
|
|
|
|
void SeekForPrev(const Slice& key) override {
|
|
WriteBatchIndexEntry search_entry(&key, column_family_id_,
|
|
false /* is_forward_direction */,
|
|
false /* is_seek_to_first */);
|
|
skip_list_iter_.SeekForPrev(&search_entry);
|
|
}
|
|
|
|
void Next() override { skip_list_iter_.Next(); }
|
|
|
|
void Prev() override { skip_list_iter_.Prev(); }
|
|
|
|
WriteEntry Entry() const override;
|
|
|
|
Status status() const override {
|
|
// this is in-memory data structure, so the only way status can be non-ok is
|
|
// through memory corruption
|
|
return Status::OK();
|
|
}
|
|
|
|
const WriteBatchIndexEntry* GetRawEntry() const {
|
|
return skip_list_iter_.key();
|
|
}
|
|
|
|
bool MatchesKey(uint32_t cf_id, const Slice& key);
|
|
|
|
private:
|
|
uint32_t column_family_id_;
|
|
WriteBatchEntrySkipList::Iterator skip_list_iter_;
|
|
const ReadableWriteBatch* write_batch_;
|
|
WriteBatchEntryComparator* comparator_;
|
|
};
|
|
|
|
class WriteBatchWithIndexInternal {
|
|
public:
|
|
// For GetFromBatchAndDB or similar
|
|
explicit WriteBatchWithIndexInternal(DB* db,
|
|
ColumnFamilyHandle* column_family);
|
|
// For GetFromBatch or similar
|
|
explicit WriteBatchWithIndexInternal(const DBOptions* db_options,
|
|
ColumnFamilyHandle* column_family);
|
|
|
|
enum Result { kFound, kDeleted, kNotFound, kMergeInProgress, kError };
|
|
|
|
// If batch contains a value for key, store it in *value and return kFound.
|
|
// If batch contains a deletion for key, return Deleted.
|
|
// If batch contains Merge operations as the most recent entry for a key,
|
|
// and the merge process does not stop (not reaching a value or delete),
|
|
// prepend the current merge operands to *operands,
|
|
// and return kMergeInProgress
|
|
// If batch does not contain this key, return kNotFound
|
|
// Else, return kError on error with error Status stored in *s.
|
|
Result GetFromBatch(WriteBatchWithIndex* batch, const Slice& key,
|
|
std::string* value, bool overwrite_key, Status* s) {
|
|
return GetFromBatch(batch, key, &merge_context_, value, overwrite_key, s);
|
|
}
|
|
Result GetFromBatch(WriteBatchWithIndex* batch, const Slice& key,
|
|
MergeContext* merge_context, std::string* value,
|
|
bool overwrite_key, Status* s);
|
|
Status MergeKey(const Slice& key, const Slice* value, std::string* result,
|
|
Slice* result_operand = nullptr) {
|
|
return MergeKey(key, value, merge_context_, result, result_operand);
|
|
}
|
|
Status MergeKey(const Slice& key, const Slice* value, MergeContext& context,
|
|
std::string* result, Slice* result_operand = nullptr);
|
|
|
|
private:
|
|
DB* db_;
|
|
const DBOptions* db_options_;
|
|
ColumnFamilyHandle* column_family_;
|
|
MergeContext merge_context_;
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
#endif // !ROCKSDB_LITE
|
|
|