@ -27,30 +27,129 @@ class SliceTransform;
// CompactionFilter allows an application to modify/delete a key-value during
// CompactionFilter allows an application to modify/delete a key-value during
// table file creation.
// table file creation.
//
//
// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
// Some general notes:
//
// * RocksDB snapshots do not guarantee to preserve the state of the DB in the
// presence of CompactionFilter. Data seen from a snapshot might disappear after
// a table file created with a `CompactionFilter` is installed. If you use
// snapshots, think twice about whether you want to use `CompactionFilter` and
// whether you are using it in a safe way.
//
// * If multithreaded compaction is being used *and* a single CompactionFilter
// instance was supplied via Options::compaction_filter, CompactionFilter
// methods may be called from different threads concurrently. The application
// must ensure that such calls are thread-safe. If the CompactionFilter was
// created by a factory, then it will only ever be used by a single thread that
// is doing the table file creation, and its methods do not need to be
// thread-safe. However, multiple filters may be in existence and operating
// concurrently.
//
// * The key passed to the filtering methods includes the timestamp if
// user-defined timestamps are enabled.
//
// * Exceptions MUST NOT propagate out of overridden functions into RocksDB,
// because RocksDB is not exception-safe. This could cause undefined behavior
// because RocksDB is not exception-safe. This could cause undefined behavior
// including data loss, unreported corruption, deadlocks, and more.
// including data loss, unreported corruption, deadlocks, and more.
class CompactionFilter : public Customizable {
class CompactionFilter : public Customizable {
public :
public :
// Value type of the key-value passed to the compaction filter's FilterV2/V3
// methods.
enum ValueType {
enum ValueType {
// Plain key-value
kValue ,
kValue ,
// Merge operand
kMergeOperand ,
kMergeOperand ,
kBlobIndex , // used internally by BlobDB.
// Used internally by the old stacked BlobDB implementation; this value type
// is never passed to application code. Note that when using the new
// integrated BlobDB, values stored separately as blobs are retrieved and
// presented to FilterV2/V3 with the type kValue above.
kBlobIndex ,
// Wide-column entity
kWideColumnEntity ,
kWideColumnEntity ,
} ;
} ;
// Potential decisions that can be returned by the compaction filter's
// FilterV2/V3 and FilterBlobByKey methods. See decision-specific caveats and
// constraints below.
enum class Decision {
enum class Decision {
// Keep the current key-value as-is.
kKeep ,
kKeep ,
// Remove the current key-value. Note that the semantics of removal are
// dependent on the value type. If the current key-value is a plain
// key-value or a wide-column entity, it is converted to a tombstone
// (Delete), resulting in the deletion of any earlier versions of the key.
// If it is a merge operand, it is simply dropped. Note: if you are using
// a TransactionDB, it is not recommended to filter out merge operands.
// If a Merge operation is filtered out, TransactionDB may not realize there
// is a write conflict and may allow a Transaction that should have failed
// to Commit. Instead, it is better to implement any Merge filtering inside
// the MergeOperator.
kRemove ,
kRemove ,
// Change the value of the current key-value. If the current key-value is a
// plain key-value or a merge operand, its value is updated but its value
// type remains the same. If the current key-value is a wide-column entity,
// it is converted to a plain key-value with the new value specified.
kChangeValue ,
kChangeValue ,
// Remove all key-values with key in [key, *skip_until). This range of keys
// will be skipped in a way that potentially avoids some IO operations
// compared to removing the keys one by one. Note that removal in this case
// means dropping the key-value regardless of value type; in other words, in
// contrast with kRemove, plain values and entities are not converted to
// tombstones.
//
// *skip_until <= key is treated the same as Decision::kKeep (since the
// range [key, *skip_until) is empty).
//
// Caveats:
// * The keys are skipped even if there are snapshots containing them,
// i.e. values removed by kRemoveAndSkipUntil can disappear from a
// snapshot - beware if you're using TransactionDB or DB::GetSnapshot().
// * If the value for a key was overwritten or merged into (multiple Put()s
// or Merge()s), and `CompactionFilter` skips this key with
// kRemoveAndSkipUntil, it's possible that it will remove only
// the new value, exposing the old value that was supposed to be
// overwritten.
// * Doesn't work with PlainTableFactory in prefix mode.
// * If you use kRemoveAndSkipUntil for table files created by compaction,
// consider also reducing compaction_readahead_size option.
kRemoveAndSkipUntil ,
kRemoveAndSkipUntil ,
kChangeBlobIndex , // used internally by BlobDB.
kIOError , // used internally by BlobDB.
// Used internally by the old stacked BlobDB implementation. Returning this
kPurge , // used for keys that can only be SingleDelete'ed
// decision from application code is not supported.
kChangeBlobIndex ,
// Used internally by the old stacked BlobDB implementation. Returning this
// decision from application code is not supported.
kIOError ,
// Remove the current key-value by converting it to a SingleDelete-type
// tombstone. Only supported for plain key-values and wide-column entities;
// not supported for merge operands. All the caveats related to
// SingleDeletes apply.
kPurge ,
// Change the current key-value to the wide-column entity specified. If the
// current key-value is already a wide-column entity, only its columns are
// updated; if it is a plain key-value, it is converted to a wide-column
// entity with the specified columns. Not supported for merge operands.
// Only applicable to FilterV3.
kChangeWideColumnEntity ,
kChangeWideColumnEntity ,
// When using the integrated BlobDB implementation, it may be possible for
// applications to make a filtering decision for a given blob based on
// the key only without actually reading the blob value, which saves some
// I/O; see the FilterBlobByKey method below. Returning kUndetermined from
// FilterBlobByKey signals that making a decision solely based on the
// key is not possible; in this case, RocksDB reads the blob value and
// passes the key-value to the regular filtering method. Only applicable to
// FilterBlobByKey; returning this value from FilterV2/V3 is not supported.
kUndetermined ,
kUndetermined ,
} ;
} ;
// Used internally by the old stacked BlobDB implementation.
enum class BlobDecision { kKeep , kChangeValue , kCorruption , kIOError } ;
enum class BlobDecision { kKeep , kChangeValue , kCorruption , kIOError } ;
// Context information for a table file creation.
// Context information for a table file creation.
@ -76,8 +175,8 @@ class CompactionFilter : public Customizable {
// The table file creation process invokes this method before adding a kv to
// The table file creation process invokes this method before adding a kv to
// the table file. A return value of false indicates that the kv should be
// the table file. A return value of false indicates that the kv should be
// preserved in the new table file and a return value of true indicates
// preserved in the new table file and a return value of true indicates
// that this key-value should be removed from the new table file. The
// that this key-value should be removed (that is, converted to a tombstone).
// application can inspect the existing value of the key and make decision
// The application can inspect the existing value of the key and make a decision
// based on it.
// based on it.
//
//
// Key-Values that are results of merge operation during table file creation
// Key-Values that are results of merge operation during table file creation
@ -88,23 +187,6 @@ class CompactionFilter : public Customizable {
// When the value is to be preserved, the application has the option
// When the value is to be preserved, the application has the option
// to modify the existing_value and pass it back through new_value.
// to modify the existing_value and pass it back through new_value.
// value_changed needs to be set to true in this case.
// value_changed needs to be set to true in this case.
//
// Note that RocksDB snapshots (i.e. call GetSnapshot() API on a
// DB* object) will not guarantee to preserve the state of the DB with
// CompactionFilter. Data seen from a snapshot might disappear after a
// table file created with a `CompactionFilter` is installed. If you use
// snapshots, think twice about whether you want to use `CompactionFilter` and
// whether you are using it in a safe way.
//
// If multithreaded compaction is being used *and* a single CompactionFilter
// instance was supplied via Options::compaction_filter, this method may be
// called from different threads concurrently. The application must ensure
// that the call is thread-safe.
//
// If the CompactionFilter was created by a factory, then it will only ever
// be used by a single thread that is doing the table file creation, and this
// call does not need to be thread-safe. However, multiple filters may be
// in existence and operating concurrently.
virtual bool Filter ( int /*level*/ , const Slice & /*key*/ ,
virtual bool Filter ( int /*level*/ , const Slice & /*key*/ ,
const Slice & /*existing_value*/ ,
const Slice & /*existing_value*/ ,
std : : string * /*new_value*/ ,
std : : string * /*new_value*/ ,
@ -126,48 +208,18 @@ class CompactionFilter : public Customizable {
return false ;
return false ;
}
}
// An extended API. Called for both values and merge operands.
// A unified API for plain values and merge operands that may
// Allows changing value and skipping ranges of keys.
// return a variety of decisions (see Decision above). The `value_type`
// parameter indicates the type of the key-value and the `existing_value`
// contains the current value or merge operand. The `new_value` output
// parameter can be used to set the updated value or merge operand when the
// kChangeValue decision is made by the filter. See the description of
// kRemoveAndSkipUntil above for the semantics of the `skip_until` output
// parameter, and see Decision above for more information on the semantics of
// the potential return values.
//
// The default implementation uses Filter() and FilterMergeOperand().
// The default implementation uses Filter() and FilterMergeOperand().
// If you're overriding this method, no need to override the other two.
// If you're overriding this method, no need to override the other two.
// `value_type` indicates whether this key-value corresponds to a normal
// value (e.g. written with Put()) or a merge operand (written with Merge()).
//
// Possible return values:
// * kKeep - keep the key-value pair.
// * kRemove - remove the key-value pair or merge operand.
// * kChangeValue - keep the key and change the value/operand to *new_value.
// * kRemoveAndSkipUntil - remove this key-value pair, and also remove
// all key-value pairs with key in [key, *skip_until). This range
// of keys will be skipped without reading, potentially saving some
// IO operations compared to removing the keys one by one.
//
// *skip_until <= key is treated the same as Decision::kKeep
// (since the range [key, *skip_until) is empty).
//
// Caveats:
// - The keys are skipped even if there are snapshots containing them,
// i.e. values removed by kRemoveAndSkipUntil can disappear from a
// snapshot - beware if you're using TransactionDB or
// DB::GetSnapshot().
// - If value for a key was overwritten or merged into (multiple Put()s
// or Merge()s), and `CompactionFilter` skips this key with
// kRemoveAndSkipUntil, it's possible that it will remove only
// the new value, exposing the old value that was supposed to be
// overwritten.
// - Doesn't work with PlainTableFactory in prefix mode.
// - If you use kRemoveAndSkipUntil for table files created by
// compaction, consider also reducing compaction_readahead_size
// option.
//
// Should never return kUndetermined.
// Note: If you are using a TransactionDB, it is not recommended to filter
// out or modify merge operands (ValueType::kMergeOperand).
// If a merge operation is filtered out, TransactionDB may not realize there
// is a write conflict and may allow a Transaction to Commit that should have
// failed. Instead, it is better to implement any Merge filtering inside the
// MergeOperator.
// key includes timestamp if user-defined timestamp is enabled.
virtual Decision FilterV2 ( int level , const Slice & key , ValueType value_type ,
virtual Decision FilterV2 ( int level , const Slice & key , ValueType value_type ,
const Slice & existing_value , std : : string * new_value ,
const Slice & existing_value , std : : string * new_value ,
std : : string * /*skip_until*/ ) const {
std : : string * /*skip_until*/ ) const {
@ -195,17 +247,21 @@ class CompactionFilter : public Customizable {
}
}
}
}
// Wide column aware API. Called for plain values, merge operands, and
// Wide column aware unified API. Called for plain values, merge operands, and
// wide-column entities; the `value_type` parameter indicates the type of the
// wide-column entities; the `value_type` parameter indicates the type of the
// key-value. When the key-value is a plain value or a merge operand, the
// key-value. When the key-value is a plain value or a merge operand, the
// `existing_value` parameter contains the existing value and the
// `existing_value` parameter contains the existing value and the
// `existing_columns` parameter is invalid (nullptr). When the key-value is a
// `existing_columns` parameter is invalid (nullptr). When the key-value is a
// wide-column entity, the `existing_columns` parameter contains the wide
// wide-column entity, the `existing_columns` parameter contains the wide
// columns of the existing entity and the `existing_value` parameter is
// columns of the existing entity and the `existing_value` parameter is
// invalid (nullptr). The output parameters `new_value` and `new_columns` can
// invalid (nullptr). The `new_value` output parameter can be used to set the
// be used to change the value or wide columns of the key-value when
// updated value or merge operand when the kChangeValue decision is made by
// `kChangeValue` or `kChangeWideColumnEntity` is returned. See above for more
// the filter. The `new_columns` output parameter can be used to specify
// information on the semantics of the potential return values.
// the pairs of column names and column values when the
// kChangeWideColumnEntity decision is returned. See the description of
// kRemoveAndSkipUntil above for the semantics of the `skip_until` output
// parameter, and see Decision above for more information on the semantics of
// the potential return values.
//
//
// For compatibility, the default implementation keeps all wide-column
// For compatibility, the default implementation keeps all wide-column
// entities, and falls back to FilterV2 for plain values and merge operands.
// entities, and falls back to FilterV2 for plain values and merge operands.
@ -255,10 +311,15 @@ class CompactionFilter : public Customizable {
virtual bool IsStackedBlobDbInternalCompactionFilter ( ) const { return false ; }
virtual bool IsStackedBlobDbInternalCompactionFilter ( ) const { return false ; }
// In the case of BlobDB, it may be possible to reach a decision with only
// In the case of BlobDB, it may be possible to reach a decision with only
// the key without reading the actual value. Keys whose value_type is
// the key without reading the actual value, saving some I/O operations.
// kBlobIndex will be checked by this method.
// Keys where the value is stored separately in a blob file will be
// Returning kUndetermined will cause FilterV3() to be called to make a
// passed to this method. If the method returns a supported decision other
// decision as usual.
// than kUndetermined, it will be considered final and performed without
// reading the existing value. Returning kUndetermined will cause FilterV3()
// to be called to make a decision as usual. The output parameters
// `new_value` and `skip_until` are applicable to the decisions kChangeValue
// and kRemoveAndSkipUntil respectively, and have the same semantics as
// the corresponding parameters of FilterV2/V3.
virtual Decision FilterBlobByKey ( int /*level*/ , const Slice & /*key*/ ,
virtual Decision FilterBlobByKey ( int /*level*/ , const Slice & /*key*/ ,
std : : string * /*new_value*/ ,
std : : string * /*new_value*/ ,
std : : string * /*skip_until*/ ) const {
std : : string * /*skip_until*/ ) const {