|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <deque>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/compaction.h"
|
|
|
|
#include "db/merge_helper.h"
|
|
|
|
#include "rocksdb/compaction_filter.h"
|
|
|
|
#include "util/log_buffer.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
// Plain counter bundle filled in by CompactionIterator and exposed through
// CompactionIterator::iter_stats(). Every counter starts at zero.
struct CompactionIteratorStats {
  // Compaction statistics
  //
  // NOTE(review): the three drop counters presumably distinguish why a record
  // was dropped (user compaction filter / hidden by a newer entry / obsolete)
  // -- confirm against the implementation file.
  int64_t num_record_drop_user = 0;
  int64_t num_record_drop_hidden = 0;
  int64_t num_record_drop_obsolete = 0;
  // NOTE(review): time unit is not visible from this header -- confirm.
  uint64_t total_filter_time = 0;

  // Input statistics
  // TODO(noetzli): The stats are incomplete. They are lacking everything
  // consumed by MergeHelper.
  uint64_t num_input_records = 0;
  uint64_t num_input_deletion_records = 0;
  uint64_t num_input_corrupt_records = 0;
  uint64_t total_input_raw_key_bytes = 0;
  uint64_t total_input_raw_value_bytes = 0;
};
|
|
|
|
|
|
|
|
class CompactionIterator {
|
|
|
|
public:
|
|
|
|
CompactionIterator(InternalIterator* input, const Comparator* cmp,
|
|
|
|
MergeHelper* merge_helper, SequenceNumber last_sequence,
|
Support marking snapshots for write-conflict checking
Summary:
D50475 enables using SST files for transaction write-conflict checking. In order for this to work, we need to make sure not to compact out SingleDeletes when there is an earlier transaction snapshot(D50295). If there is a long-held snapshot, this could reduce the benefit of the SingleDelete optimization.
This diff allows Transactions to mark snapshots as being used for write-conflict checking. Then, during compaction, we will be able to optimize SingleDeletes better in the future.
This diff adds a flag to SnapshotImpl which is used by Transactions. This diff also passes the earliest write-conflict snapshot's sequence number to CompactionIterator. This diff does not actually change Compaction (after this diff is pushed, D50295 will be able to use this information).
Test Plan: no behavior change, ran existing tests
Reviewers: rven, kradhakrishnan, yhchiang, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D51183
9 years ago
|
|
|
std::vector<SequenceNumber>* snapshots,
|
|
|
|
SequenceNumber earliest_write_conflict_snapshot, Env* env,
|
|
|
|
bool expect_valid_internal_key,
|
|
|
|
Compaction* compaction = nullptr,
|
|
|
|
const CompactionFilter* compaction_filter = nullptr,
|
|
|
|
LogBuffer* log_buffer = nullptr);
|
|
|
|
|
|
|
|
void ResetRecordCounts();
|
|
|
|
|
|
|
|
// Seek to the beginning of the compaction iterator output.
|
|
|
|
//
|
|
|
|
// REQUIRED: Call only once.
|
|
|
|
void SeekToFirst();
|
|
|
|
|
|
|
|
// Produces the next record in the compaction.
|
|
|
|
//
|
|
|
|
// REQUIRED: SeekToFirst() has been called.
|
|
|
|
void Next();
|
|
|
|
|
|
|
|
// Getters
|
|
|
|
const Slice& key() const { return key_; }
|
|
|
|
const Slice& value() const { return value_; }
|
|
|
|
const Status& status() const { return status_; }
|
|
|
|
const ParsedInternalKey& ikey() const { return ikey_; }
|
|
|
|
bool Valid() const { return valid_; }
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
const Slice& user_key() const { return current_user_key_; }
|
|
|
|
const CompactionIteratorStats& iter_stats() const { return iter_stats_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
// Processes the input stream to find the next output
|
|
|
|
void NextFromInput();
|
|
|
|
|
|
|
|
// Do last preparations before presenting the output to the callee. At this
|
|
|
|
// point this only zeroes out the sequence number if possible for better
|
|
|
|
// compression.
|
|
|
|
void PrepareOutput();
|
|
|
|
|
|
|
|
// Given a sequence number, return the sequence number of the
|
|
|
|
// earliest snapshot that this sequence number is visible in.
|
|
|
|
// The snapshots themselves are arranged in ascending order of
|
|
|
|
// sequence numbers.
|
|
|
|
// Employ a sequential search because the total number of
|
|
|
|
// snapshots are typically small.
|
|
|
|
inline SequenceNumber findEarliestVisibleSnapshot(
|
|
|
|
SequenceNumber in, SequenceNumber* prev_snapshot);
|
|
|
|
|
|
|
|
InternalIterator* input_;
|
|
|
|
const Comparator* cmp_;
|
|
|
|
MergeHelper* merge_helper_;
|
|
|
|
const std::vector<SequenceNumber>* snapshots_;
|
Support marking snapshots for write-conflict checking
Summary:
D50475 enables using SST files for transaction write-conflict checking. In order for this to work, we need to make sure not to compact out SingleDeletes when there is an earlier transaction snapshot(D50295). If there is a long-held snapshot, this could reduce the benefit of the SingleDelete optimization.
This diff allows Transactions to mark snapshots as being used for write-conflict checking. Then, during compaction, we will be able to optimize SingleDeletes better in the future.
This diff adds a flag to SnapshotImpl which is used by Transactions. This diff also passes the earliest write-conflict snapshot's sequence number to CompactionIterator. This diff does not actually change Compaction (after this diff is pushed, D50295 will be able to use this information).
Test Plan: no behavior change, ran existing tests
Reviewers: rven, kradhakrishnan, yhchiang, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D51183
9 years ago
|
|
|
const SequenceNumber earliest_write_conflict_snapshot_;
|
|
|
|
Env* env_;
|
|
|
|
bool expect_valid_internal_key_;
|
|
|
|
Compaction* compaction_;
|
|
|
|
const CompactionFilter* compaction_filter_;
|
|
|
|
LogBuffer* log_buffer_;
|
|
|
|
bool bottommost_level_;
|
|
|
|
bool valid_ = false;
|
|
|
|
SequenceNumber visible_at_tip_;
|
|
|
|
SequenceNumber earliest_snapshot_;
|
|
|
|
SequenceNumber latest_snapshot_;
|
|
|
|
bool ignore_snapshots_;
|
|
|
|
|
|
|
|
// State
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
//
|
|
|
|
// Points to a copy of the current compaction iterator output (current_key_)
|
|
|
|
// if valid_.
|
|
|
|
Slice key_;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
// Points to the value in the underlying iterator that corresponds to the
|
|
|
|
// current output.
|
|
|
|
Slice value_;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
// The status is OK unless compaction iterator encounters a merge operand
|
|
|
|
// while not having a merge operator defined.
|
|
|
|
Status status_;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
// Stores the user key, sequence number and type of the current compaction
|
|
|
|
// iterator output (or current key in the underlying iterator during
|
|
|
|
// NextFromInput()).
|
|
|
|
ParsedInternalKey ikey_;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
// Stores whether ikey_.user_key is valid. If set to false, the user key is
|
|
|
|
// not compared against the current key in the underlying iterator.
|
|
|
|
bool has_current_user_key_ = false;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
9 years ago
|
|
|
bool at_next_ = false; // If false, the iterator
|
|
|
|
// Holds a copy of the current compaction iterator output (or current key in
|
|
|
|
// the underlying iterator during NextFromInput()).
|
|
|
|
IterKey current_key_;
|
|
|
|
Slice current_user_key_;
|
|
|
|
SequenceNumber current_user_key_sequence_;
|
|
|
|
SequenceNumber current_user_key_snapshot_;
|
|
|
|
MergeOutputIterator merge_out_iter_;
|
|
|
|
std::string compaction_filter_value_;
|
|
|
|
// "level_ptrs" holds indices that remember which file of an associated
|
|
|
|
// level we were last checking during the last call to compaction->
|
|
|
|
// KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
|
|
|
|
// to pick off where it left off since each subcompaction's key range is
|
|
|
|
// increasing so a later call to the function must be looking for a key that
|
|
|
|
// is in or beyond the last file checked during the previous call
|
|
|
|
std::vector<size_t> level_ptrs_;
|
|
|
|
CompactionIteratorStats iter_stats_;
|
|
|
|
};
|
|
|
|
} // namespace rocksdb
|