Summary: Added a couple of functions to WriteBatchWithIndex to make it easier to query the value of a key, including reading pending writes from a batch. (This is needed for transactions.) I created write_batch_with_index_internal.h to store an internal-only helper function, since there wasn't a good place in the existing class hierarchy for it (and it didn't seem right to put it inside WriteBatchInternal::Rep). Since I needed access to the WriteBatchEntryComparator, I moved some helper classes from write_batch_with_index.cc into write_batch_with_index_internal.h/.cc. WriteBatchIndexEntry, ReadableWriteBatch, and WriteBatchEntryComparator are all unchanged (just moved to different files). Test Plan: Added new unit tests. Reviewers: rven, yhchiang, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D38037 (branch: main)
parent
3996fff8a1
commit
711465ccec
@ -0,0 +1,242 @@ |
|||||||
|
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#include "db/column_family.h" |
||||||
|
#include "db/merge_context.h" |
||||||
|
#include "db/merge_helper.h" |
||||||
|
#include "rocksdb/comparator.h" |
||||||
|
#include "rocksdb/db.h" |
||||||
|
#include "rocksdb/utilities/write_batch_with_index.h" |
||||||
|
#include "util/coding.h" |
||||||
|
#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
class Env; |
||||||
|
class Logger; |
||||||
|
class Statistics; |
||||||
|
|
||||||
|
// Decode the write-batch record that starts at byte `data_offset` of the
// serialized batch representation (rep_).
//
// On success, fills in:
//   *type  - the kind of record (put / delete / merge / log-data)
//   *Key   - the user key stored in the record
//   *value - the value (valid for put/merge records)
//   *blob  - the blob payload (valid for log-data records)
//
// Returns InvalidArgument if any output pointer is null or the offset is out
// of range, Corruption if the record tag is unrecognized, or the decode error
// from ReadRecordFromWriteBatch.
Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
                                                  WriteType* type, Slice* Key,
                                                  Slice* value,
                                                  Slice* blob) const {
  if (type == nullptr || Key == nullptr || value == nullptr ||
      blob == nullptr) {
    return Status::InvalidArgument("Output parameters cannot be null");
  }

  if (data_offset >= GetDataSize()) {
    return Status::InvalidArgument("data offset exceed write batch size");
  }
  // View of the remaining serialized batch, starting at the requested record.
  Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset);
  char tag;
  uint32_t column_family;
  Status s =
      ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value, blob);
  // Propagate decode failures immediately.  Previously the code fell through
  // to the switch below with a possibly-unset `tag`, which masked the real
  // parse error as a generic "unknown WriteBatch tag" corruption.
  if (!s.ok()) {
    return s;
  }

  // Map the low-level batch tag onto the public WriteType enum.  The
  // column-family variants carry the same payload shape as the default-CF
  // variants, so each pair collapses to one WriteType.
  switch (tag) {
    case kTypeColumnFamilyValue:
    case kTypeValue:
      *type = kPutRecord;
      break;
    case kTypeColumnFamilyDeletion:
    case kTypeDeletion:
      *type = kDeleteRecord;
      break;
    case kTypeColumnFamilyMerge:
    case kTypeMerge:
      *type = kMergeRecord;
      break;
    case kTypeLogData:
      *type = kLogDataRecord;
      break;
    default:
      return Status::Corruption("unknown WriteBatch tag");
  }
  return Status::OK();
}
||||||
|
|
||||||
|
// Three-way comparison of two skip-list index entries.  Ordering is:
//   1. column family id (ascending),
//   2. the kFlagMin sentinel sorts before every real entry in its CF,
//   3. the user key (via the per-CF or default comparator),
//   4. ties broken by batch offset, i.e. insertion order.
// Returns <0, 0, or >0 like a classic comparator.
int WriteBatchEntryComparator::operator()(
    const WriteBatchIndexEntry* entry1,
    const WriteBatchIndexEntry* entry2) const {
  if (entry1->column_family > entry2->column_family) {
    return 1;
  } else if (entry1->column_family < entry2->column_family) {
    return -1;
  }

  // kFlagMin marks a synthetic "smaller than everything" entry used for
  // seeking to the start of a column family.
  // NOTE(review): if both entries carried kFlagMin in the same CF this would
  // return -1 for either argument order — presumably at most one such
  // sentinel per CF is ever compared; verify against the skip-list usage.
  if (entry1->offset == WriteBatchIndexEntry::kFlagMin) {
    return -1;
  } else if (entry2->offset == WriteBatchIndexEntry::kFlagMin) {
    return 1;
  }

  Status s;
  Slice key1, key2;
  // An entry either embeds a lookup key directly (search_key != nullptr, used
  // for Seek targets) or references a key stored in the write batch at
  // `offset`, which must be decoded on the fly.
  if (entry1->search_key == nullptr) {
    Slice value, blob;
    WriteType write_type;
    s = write_batch_->GetEntryFromDataOffset(entry1->offset, &write_type, &key1,
                                             &value, &blob);
    if (!s.ok()) {
      // Comparators cannot report errors; push undecodable entries to the end.
      return 1;
    }
  } else {
    key1 = *(entry1->search_key);
  }
  if (entry2->search_key == nullptr) {
    Slice value, blob;
    WriteType write_type;
    s = write_batch_->GetEntryFromDataOffset(entry2->offset, &write_type, &key2,
                                             &value, &blob);
    if (!s.ok()) {
      return -1;
    }
  } else {
    key2 = *(entry2->search_key);
  }

  int cmp = CompareKey(entry1->column_family, key1, key2);
  if (cmp != 0) {
    return cmp;
  } else if (entry1->offset > entry2->offset) {
    // Equal keys: later writes (larger offsets) sort after earlier ones, so
    // iteration over duplicates reflects insertion order.
    return 1;
  } else if (entry1->offset < entry2->offset) {
    return -1;
  }
  return 0;
}
||||||
|
|
||||||
|
// Compare two user keys within the given column family, using the
// column-family-specific comparator when one has been registered via
// SetComparatorForCF, and the default comparator otherwise.
int WriteBatchEntryComparator::CompareKey(uint32_t column_family,
                                          const Slice& key1,
                                          const Slice& key2) const {
  const Comparator* comparator = default_comparator_;
  auto cf_entry = cf_comparator_map_.find(column_family);
  if (cf_entry != cf_comparator_map_.end()) {
    comparator = cf_entry->second;
  }
  return comparator->Compare(key1, key2);
}
||||||
|
|
||||||
|
// Look up `key` among the writes buffered in `batch` for `column_family`.
//
// Walks the batch's entries for the key from newest to oldest, collecting
// merge operands until a Put, Delete, or the start of the key's entries is
// reached, then resolves any pending merges via the CF's merge operator.
//
// Outputs:
//   *value         - filled with the resolved value when kFound is returned
//   *merge_context - accumulates merge operands when kMergeInProgress
//   *s             - detailed error status when kError is returned
// Returns kFound / kDeleted / kNotFound / kMergeInProgress / kError.
WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
    const DBOptions& options, WriteBatchWithIndex* batch,
    ColumnFamilyHandle* column_family, const Slice& key,
    MergeContext* merge_context, WriteBatchEntryComparator* cmp,
    std::string* value, Status* s) {
  uint32_t cf_id = GetColumnFamilyID(column_family);
  *s = Status::OK();
  WriteBatchWithIndexInternal::Result result =
      WriteBatchWithIndexInternal::Result::kNotFound;

  std::unique_ptr<WBWIIterator> iter =
      std::unique_ptr<WBWIIterator>(batch->NewIterator(column_family));

  // We want to iterate in the reverse order that the writes were added to the
  // batch. Since we don't have a reverse iterator, we must seek past the end.
  // TODO(agiardullo): consider adding support for reverse iteration
  iter->Seek(key);
  while (iter->Valid()) {
    const WriteEntry& entry = iter->Entry();
    if (cmp->CompareKey(cf_id, entry.key, key) != 0) {
      break;
    }

    iter->Next();
  }

  // NOTE(review): *s was set to OK above and nothing in the loop assigns to
  // it, so this check looks defensive/dead — confirm whether iter->status()
  // was meant to be consulted here instead.
  if (!(*s).ok()) {
    return WriteBatchWithIndexInternal::Result::kError;
  }

  if (!iter->Valid()) {
    // Read past end of results.  Reposition on last result.
    iter->SeekToLast();
  } else {
    iter->Prev();
  }

  // Walk backwards over the entries for `key`, newest first.
  const Slice* entry_value = nullptr;
  while (iter->Valid()) {
    const WriteEntry& entry = iter->Entry();
    if (cmp->CompareKey(cf_id, entry.key, key) != 0) {
      // Unexpected error or we've reached a different next key
      break;
    }

    switch (entry.type) {
      case kPutRecord: {
        result = WriteBatchWithIndexInternal::Result::kFound;
        entry_value = &entry.value;
        break;
      }
      case kMergeRecord: {
        // Keep collecting operands; resolution happens after the loop.
        result = WriteBatchWithIndexInternal::Result::kMergeInProgress;
        merge_context->PushOperand(entry.value);
        break;
      }
      case kDeleteRecord: {
        result = WriteBatchWithIndexInternal::Result::kDeleted;
        break;
      }
      case kLogDataRecord: {
        // ignore
        break;
      }
      default: {
        result = WriteBatchWithIndexInternal::Result::kError;
        (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:",
                                  std::to_string(entry.type));
        break;
      }
    }
    if (result == WriteBatchWithIndexInternal::Result::kFound ||
        result == WriteBatchWithIndexInternal::Result::kDeleted ||
        result == WriteBatchWithIndexInternal::Result::kError) {
      // We can stop iterating once we find a PUT or DELETE
      break;
    }

    iter->Prev();
  }

  if ((*s).ok()) {
    if (result == WriteBatchWithIndexInternal::Result::kFound ||
        result == WriteBatchWithIndexInternal::Result::kDeleted) {
      // Found a Put or Delete.  Merge if necessary.
      if (merge_context->GetNumOperands() > 0) {
        const MergeOperator* merge_operator;

        if (column_family != nullptr) {
          auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
          merge_operator = cfh->cfd()->ioptions()->merge_operator;
        } else {
          // Merge resolution needs the CF's merge operator, so the default-CF
          // (nullptr) path cannot proceed.
          *s = Status::InvalidArgument("Must provide a column_family");
          result = WriteBatchWithIndexInternal::Result::kError;
          return result;
        }
        Statistics* statistics = options.statistics.get();
        Env* env = options.env;
        Logger* logger = options.info_log.get();

        // entry_value is the Put base value, or nullptr when the base was a
        // Delete; TimedFullMerge handles both cases.
        *s = MergeHelper::TimedFullMerge(
            key, entry_value, merge_context->GetOperands(), merge_operator,
            statistics, env, logger, value);
        if ((*s).ok()) {
          result = WriteBatchWithIndexInternal::Result::kFound;
        } else {
          result = WriteBatchWithIndexInternal::Result::kError;
        }
      } else {  // nothing to merge
        if (result == WriteBatchWithIndexInternal::Result::kFound) {  // PUT
          value->assign(entry_value->data(), entry_value->size());
        }
      }
    }
  }

  return result;
}
||||||
|
|
||||||
|
} // namespace rocksdb
|
@ -0,0 +1,96 @@ |
|||||||
|
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#pragma once |
||||||
|
|
||||||
|
#include <limits> |
||||||
|
#include <string> |
||||||
|
#include <unordered_map> |
||||||
|
|
||||||
|
#include "rocksdb/comparator.h" |
||||||
|
#include "rocksdb/iterator.h" |
||||||
|
#include "rocksdb/slice.h" |
||||||
|
#include "rocksdb/status.h" |
||||||
|
#include "rocksdb/utilities/write_batch_with_index.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
class MergeContext; |
||||||
|
struct Options; |
||||||
|
|
||||||
|
// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
|
||||||
|
// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
struct WriteBatchIndexEntry {
  // Entry that refers to a record stored in the write batch at offset `o`.
  WriteBatchIndexEntry(size_t o, uint32_t c)
      : offset(o), column_family(c), search_key(nullptr) {}
  // Search-only entry that carries its key directly (used for Seek targets);
  // offset is unused and set to 0.
  WriteBatchIndexEntry(const Slice* sk, uint32_t c)
      : offset(0), column_family(c), search_key(sk) {}

  // If this flag appears in the offset, it indicates a key that is smaller
  // than any other entry for the same column family
  static const size_t kFlagMin = std::numeric_limits<size_t>::max();

  size_t offset;           // offset of an entry in write batch's string buffer.
  uint32_t column_family;  // column family of the entry
  const Slice* search_key;  // if not null, instead of reading keys from
                            // write batch, use it to compare. This is used
                            // for lookup key.
};
||||||
|
|
||||||
|
// A WriteBatch whose serialized records can be read back individually by
// byte offset, which is what the skip-list index entries store.
class ReadableWriteBatch : public WriteBatch {
 public:
  explicit ReadableWriteBatch(size_t reserved_bytes = 0)
      : WriteBatch(reserved_bytes) {}
  // Retrieve some information from a write entry in the write batch, given
  // the start offset of the write entry.
  Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
                                Slice* value, Slice* blob) const;
};
||||||
|
|
||||||
|
// Comparator for WriteBatchIndexEntry objects in the skip-list index.
// Orders by column family, then user key (using a per-CF comparator when
// registered), then batch offset.  Keys referenced by offset are decoded
// from `write_batch` on demand.
class WriteBatchEntryComparator {
 public:
  WriteBatchEntryComparator(const Comparator* _default_comparator,
                            const ReadableWriteBatch* write_batch)
      : default_comparator_(_default_comparator), write_batch_(write_batch) {}
  // Compare a and b. Return a negative value if a is less than b, 0 if they
  // are equal, and a positive value if a is greater than b
  int operator()(const WriteBatchIndexEntry* entry1,
                 const WriteBatchIndexEntry* entry2) const;

  // Compare two user keys within `column_family`, preferring a comparator
  // registered via SetComparatorForCF over the default.
  int CompareKey(uint32_t column_family, const Slice& key1,
                 const Slice& key2) const;

  // Register a comparator to use for keys of the given column family.
  void SetComparatorForCF(uint32_t column_family_id,
                          const Comparator* comparator) {
    cf_comparator_map_[column_family_id] = comparator;
  }

  const Comparator* default_comparator() { return default_comparator_; }

 private:
  const Comparator* default_comparator_;
  // Maps column family id -> comparator; CFs absent from the map fall back
  // to default_comparator_.
  std::unordered_map<uint32_t, const Comparator*> cf_comparator_map_;
  // Not owned; used to decode keys referenced by offset.
  const ReadableWriteBatch* write_batch_;
};
||||||
|
|
||||||
|
// Internal-only helpers for WriteBatchWithIndex that have no natural home in
// the public class hierarchy.
class WriteBatchWithIndexInternal {
 public:
  // Outcome of a batch lookup; see GetFromBatch.
  enum Result { kFound, kDeleted, kNotFound, kMergeInProgress, kError };

  // If batch contains a value for key, store it in *value and return kFound.
  // If batch contains a deletion for key, return Deleted.
  // If batch contains Merge operations as the most recent entry for a key,
  // and the merge process does not stop (not reaching a value or delete),
  // prepend the current merge operands to *operands,
  // and return kMergeInProgress
  // If batch does not contain this key, return kNotFound
  // Else, return kError on error with error Status stored in *s.
  static WriteBatchWithIndexInternal::Result GetFromBatch(
      const DBOptions& options, WriteBatchWithIndex* batch,
      ColumnFamilyHandle* column_family, const Slice& key,
      MergeContext* merge_context, WriteBatchEntryComparator* cmp,
      std::string* value, Status* s);
};
||||||
|
|
||||||
|
} // namespace rocksdb
|
Loading…
Reference in new issue