Compaction filter support for BlobDB (#6850)

Summary:
Added compaction filter support for BlobDB non-TTL values. Same as vanilla RocksDB, user compaction filter applies to all k/v pairs of the compaction for non-TTL values. It honors `min_blob_size`, which potentially results value transitions between inlined data and stored-in-blob data when size of value is changed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6850

Reviewed By: siying

Differential Revision: D22263487

Pulled By: ltamasi

fbshipit-source-id: 8fc03f8cde2a5c831e63b436b3dbf1b7f90939e8
main
Burton Li 4 years ago committed by Facebook GitHub Bot
parent 58547e533b
commit 5be2cb6948
  1. 1
      HISTORY.md
  2. 29
      db/compaction/compaction_iterator.cc
  3. 3
      db/compaction/compaction_iterator.h
  4. 2
      include/rocksdb/compaction_filter.h
  5. 331
      utilities/blob_db/blob_compaction_filter.cc
  6. 105
      utilities/blob_db/blob_compaction_filter.h
  7. 75
      utilities/blob_db/blob_db_impl.cc
  8. 7
      utilities/blob_db/blob_db_impl.h
  9. 188
      utilities/blob_db/blob_db_test.cc
  10. 1
      utilities/blob_db/blob_file.h
  11. 37
      utilities/compaction_filters/layered_compaction_filter_base.h
  12. 32
      utilities/ttl/db_ttl_impl.h

@ -19,6 +19,7 @@
* DB identity (`db_id`) and DB session identity (`db_session_id`) are added to table properties and stored in SST files. SST files generated from SstFileWriter and Repairer have DB identity “SST Writer” and “DB Repairer”, respectively. Their DB session IDs are generated in the same way as `DB::GetDbSessionId`. The session ID for SstFileWriter (resp., Repairer) resets every time `SstFileWriter::Open` (resp., `Repairer::Run`) is called. * DB identity (`db_id`) and DB session identity (`db_session_id`) are added to table properties and stored in SST files. SST files generated from SstFileWriter and Repairer have DB identity “SST Writer” and “DB Repairer”, respectively. Their DB session IDs are generated in the same way as `DB::GetDbSessionId`. The session ID for SstFileWriter (resp., Repairer) resets every time `SstFileWriter::Open` (resp., `Repairer::Run`) is called.
* Added experimental option BlockBasedTableOptions::optimize_filters_for_memory for reducing allocated memory size of Bloom filters (~10% savings with Jemalloc) while preserving the same general accuracy. To have an effect, the option requires format_version=5 and malloc_usable_size. Enabling this option is forward and backward compatible with existing format_version=5. * Added experimental option BlockBasedTableOptions::optimize_filters_for_memory for reducing allocated memory size of Bloom filters (~10% savings with Jemalloc) while preserving the same general accuracy. To have an effect, the option requires format_version=5 and malloc_usable_size. Enabling this option is forward and backward compatible with existing format_version=5.
* `BackupableDBOptions::new_naming_for_backup_files` is added. This option is true by default. When it is true, backup table filenames are of the form `<file_number>_<crc32c>_<db_session_id>.sst` as opposed to `<file_number>_<crc32c>_<file_size>.sst`. When there is no `db_session_id` available in the table file, we use `file_size` as a fallback. Note that when this option is true, it comes into effect only when both `share_files_with_checksum` and `share_table_files` are true. * `BackupableDBOptions::new_naming_for_backup_files` is added. This option is true by default. When it is true, backup table filenames are of the form `<file_number>_<crc32c>_<db_session_id>.sst` as opposed to `<file_number>_<crc32c>_<file_size>.sst`. When there is no `db_session_id` available in the table file, we use `file_size` as a fallback. Note that when this option is true, it comes into effect only when both `share_files_with_checksum` and `share_table_files` are true.
* Added compaction filter support for BlobDB non-TTL values. Same as vanilla RocksDB, user compaction filter applies to all k/v pairs of the compaction for non-TTL values. It honors `min_blob_size`, which potentially results value transitions between inlined data and stored-in-blob data when size of value is changed.
### Bug Fixes ### Bug Fixes
* Fail recovery and report once hitting a physical log record checksum mismatch, while reading MANIFEST. RocksDB should not continue processing the MANIFEST any further. * Fail recovery and report once hitting a physical log record checksum mismatch, while reading MANIFEST. RocksDB should not continue processing the MANIFEST any further.

@ -182,7 +182,7 @@ void CompactionIterator::Next() {
PrepareOutput(); PrepareOutput();
} }
void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
Slice* skip_until) { Slice* skip_until) {
if (compaction_filter_ != nullptr && if (compaction_filter_ != nullptr &&
(ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) {
@ -225,15 +225,32 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
value_.clear(); value_.clear();
iter_stats_.num_record_drop_user++; iter_stats_.num_record_drop_user++;
} else if (filter == CompactionFilter::Decision::kChangeValue) { } else if (filter == CompactionFilter::Decision::kChangeValue) {
if (ikey_.type == kTypeBlobIndex) {
// value transfer from blob file to inlined data
ikey_.type = kTypeValue;
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
}
value_ = compaction_filter_value_; value_ = compaction_filter_value_;
} else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
*need_skip = true; *need_skip = true;
compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
kValueTypeForSeek); kValueTypeForSeek);
*skip_until = compaction_filter_skip_until_.Encode(); *skip_until = compaction_filter_skip_until_.Encode();
} else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
if (ikey_.type == kTypeValue) {
// value transfer from inlined data to blob file
ikey_.type = kTypeBlobIndex;
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
} }
value_ = compaction_filter_value_;
} else if (filter == CompactionFilter::Decision::kIOError) {
status_ =
Status::IOError("Failed to access blob during compaction filter");
return false;
} }
} }
return true;
}
void CompactionIterator::NextFromInput() { void CompactionIterator::NextFromInput() {
at_next_ = false; at_next_ = false;
@ -295,8 +312,9 @@ void CompactionIterator::NextFromInput() {
// Apply the compaction filter to the first committed version of the user // Apply the compaction filter to the first committed version of the user
// key. // key.
if (current_key_committed_) { if (current_key_committed_ &&
InvokeFilterIfNeeded(&need_skip, &skip_until); !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
break;
} }
} else { } else {
// Update the current key to reflect the new sequence number/type without // Update the current key to reflect the new sequence number/type without
@ -316,8 +334,9 @@ void CompactionIterator::NextFromInput() {
current_key_committed_ = KeyCommitted(ikey_.sequence); current_key_committed_ = KeyCommitted(ikey_.sequence);
// Apply the compaction filter to the first committed version of the // Apply the compaction filter to the first committed version of the
// user key. // user key.
if (current_key_committed_) { if (current_key_committed_ &&
InvokeFilterIfNeeded(&need_skip, &skip_until); !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
break;
} }
} }
} }

@ -121,7 +121,8 @@ class CompactionIterator {
void PrepareOutput(); void PrepareOutput();
// Invoke compaction filter if needed. // Invoke compaction filter if needed.
void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); // Return true on success, false on failures (e.g.: kIOError).
bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
// Given a sequence number, return the sequence number of the // Given a sequence number, return the sequence number of the
// earliest snapshot that this sequence number is visible in. // earliest snapshot that this sequence number is visible in.

@ -45,6 +45,8 @@ class CompactionFilter {
kRemove, kRemove,
kChangeValue, kChangeValue,
kRemoveAndSkipUntil, kRemoveAndSkipUntil,
kChangeBlobIndex, // used internally by BlobDB.
kIOError, // used internally by BlobDB.
}; };
enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError };

@ -6,19 +6,41 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "utilities/blob_db/blob_compaction_filter.h" #include "utilities/blob_db/blob_compaction_filter.h"
#include "db/dbformat.h"
#include <cinttypes> #include <cinttypes>
#include "db/dbformat.h"
#include "test_util/sync_point.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
namespace blob_db { namespace blob_db {
BlobIndexCompactionFilterBase::~BlobIndexCompactionFilterBase() {
if (blob_file_) {
CloseAndRegisterNewBlobFile();
}
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_);
}
CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2( CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
int /*level*/, const Slice& key, ValueType value_type, const Slice& value, int level, const Slice& key, ValueType value_type, const Slice& value,
std::string* /*new_value*/, std::string* /*skip_until*/) const { std::string* new_value, std::string* skip_until) const {
const CompactionFilter* ucf = user_comp_filter();
if (value_type != kBlobIndex) { if (value_type != kBlobIndex) {
if (ucf == nullptr) {
return Decision::kKeep; return Decision::kKeep;
} }
// Apply user compaction filter for inlined data.
CompactionFilter::Decision decision =
ucf->FilterV2(level, key, value_type, value, new_value, skip_until);
if (decision == Decision::kChangeValue) {
return HandleValueChange(key, new_value);
}
return decision;
}
BlobIndex blob_index; BlobIndex blob_index;
Status s = blob_index.DecodeFrom(value); Status s = blob_index.DecodeFrom(value);
if (!s.ok()) { if (!s.ok()) {
@ -44,26 +66,82 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
// Hack: Internal key is passed to BlobIndexCompactionFilter for it to // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
// get sequence number. // get sequence number.
ParsedInternalKey ikey; ParsedInternalKey ikey;
bool ok = ParseInternalKey(key, &ikey); if (!ParseInternalKey(key, &ikey)) {
assert(false);
return Decision::kKeep;
}
// Remove keys that could have been remove by last FIFO eviction. // Remove keys that could have been remove by last FIFO eviction.
// If get error while parsing key, ignore and continue. // If get error while parsing key, ignore and continue.
if (ok && ikey.sequence < context_.fifo_eviction_seq) { if (ikey.sequence < context_.fifo_eviction_seq) {
evicted_count_++; evicted_count_++;
evicted_size_ += key.size() + value.size(); evicted_size_ += key.size() + value.size();
return Decision::kRemove; return Decision::kRemove;
} }
} }
// Apply user compaction filter for all non-TTL blob data.
if (ucf != nullptr && !blob_index.HasTTL()) {
// Hack: Internal key is passed to BlobIndexCompactionFilter for it to
// get sequence number.
ParsedInternalKey ikey;
if (!ParseInternalKey(key, &ikey)) {
assert(false);
return Decision::kKeep;
}
// Read value from blob file.
PinnableSlice blob;
CompressionType compression_type = kNoCompression;
constexpr bool need_decompress = true;
if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob, need_decompress,
&compression_type)) {
return Decision::kIOError;
}
CompactionFilter::Decision decision = ucf->FilterV2(
level, ikey.user_key, kValue, blob, new_value, skip_until);
if (decision == Decision::kChangeValue) {
return HandleValueChange(ikey.user_key, new_value);
}
return decision;
}
return Decision::kKeep; return Decision::kKeep;
} }
BlobIndexCompactionFilterGC::~BlobIndexCompactionFilterGC() { CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
if (blob_file_) { const Slice& key, std::string* new_value) const {
CloseAndRegisterNewBlobFile(); BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
assert(blob_db_impl);
if (new_value->size() < blob_db_impl->bdb_options_.min_blob_size) {
// Keep new_value inlined.
return Decision::kChangeValue;
}
if (!OpenNewBlobFileIfNeeded()) {
return Decision::kIOError;
}
Slice new_blob_value(*new_value);
std::string compression_output;
if (blob_db_impl->bdb_options_.compression != kNoCompression) {
new_blob_value =
blob_db_impl->GetCompressedSlice(new_blob_value, &compression_output);
}
uint64_t new_blob_file_number = 0;
uint64_t new_blob_offset = 0;
if (!WriteBlobToNewFile(key, new_blob_value, &new_blob_file_number,
&new_blob_offset)) {
return Decision::kIOError;
}
if (!CloseAndRegisterNewBlobFileIfNeeded()) {
return Decision::kIOError;
}
BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
new_blob_value.size(),
blob_db_impl->bdb_options_.compression);
return Decision::kChangeBlobIndex;
} }
assert(context_gc_.blob_db_impl); BlobIndexCompactionFilterGC::~BlobIndexCompactionFilterGC() {
assert(context().blob_db_impl);
ROCKS_LOG_INFO(context_gc_.blob_db_impl->db_options_.info_log, ROCKS_LOG_INFO(context().blob_db_impl->db_options_.info_log,
"GC pass finished %s: encountered %" PRIu64 " blobs (%" PRIu64 "GC pass finished %s: encountered %" PRIu64 " blobs (%" PRIu64
" bytes), relocated %" PRIu64 " blobs (%" PRIu64 " bytes), relocated %" PRIu64 " blobs (%" PRIu64
" bytes), created %" PRIu64 " new blob file(s)", " bytes), created %" PRIu64 " new blob file(s)",
@ -80,83 +158,20 @@ BlobIndexCompactionFilterGC::~BlobIndexCompactionFilterGC() {
RecordTick(statistics(), BLOB_DB_GC_FAILURES, gc_stats_.HasError()); RecordTick(statistics(), BLOB_DB_GC_FAILURES, gc_stats_.HasError());
} }
CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput( bool BlobIndexCompactionFilterBase::IsBlobFileOpened() const {
const Slice& key, const Slice& existing_value, if (blob_file_) {
std::string* new_value) const { assert(writer_);
assert(new_value); return true;
const BlobDBImpl* const blob_db_impl = context_gc_.blob_db_impl;
(void)blob_db_impl;
assert(blob_db_impl);
assert(blob_db_impl->bdb_options_.enable_garbage_collection);
BlobIndex blob_index;
const Status s = blob_index.DecodeFrom(existing_value);
if (!s.ok()) {
gc_stats_.SetError();
return BlobDecision::kCorruption;
}
if (blob_index.IsInlined()) {
gc_stats_.AddBlob(blob_index.value().size());
return BlobDecision::kKeep;
}
gc_stats_.AddBlob(blob_index.size());
if (blob_index.HasTTL()) {
return BlobDecision::kKeep;
}
if (blob_index.file_number() >= context_gc_.cutoff_file_number) {
return BlobDecision::kKeep;
}
// Note: each compaction generates its own blob files, which, depending on the
// workload, might result in many small blob files. The total number of files
// is bounded though (determined by the number of compactions and the blob
// file size option).
if (!OpenNewBlobFileIfNeeded()) {
gc_stats_.SetError();
return BlobDecision::kIOError;
}
PinnableSlice blob;
CompressionType compression_type = kNoCompression;
if (!ReadBlobFromOldFile(key, blob_index, &blob, &compression_type)) {
gc_stats_.SetError();
return BlobDecision::kIOError;
}
uint64_t new_blob_file_number = 0;
uint64_t new_blob_offset = 0;
if (!WriteBlobToNewFile(key, blob, &new_blob_file_number, &new_blob_offset)) {
gc_stats_.SetError();
return BlobDecision::kIOError;
}
if (!CloseAndRegisterNewBlobFileIfNeeded()) {
gc_stats_.SetError();
return BlobDecision::kIOError;
} }
return false;
BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
blob.size(), compression_type);
gc_stats_.AddRelocatedBlob(blob_index.size());
return BlobDecision::kChangeValue;
} }
bool BlobIndexCompactionFilterGC::OpenNewBlobFileIfNeeded() const { bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const {
if (blob_file_) { if (IsBlobFileOpened()) {
assert(writer_);
return true; return true;
} }
BlobDBImpl* const blob_db_impl = context_gc_.blob_db_impl; BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
assert(blob_db_impl); assert(blob_db_impl);
const Status s = blob_db_impl->CreateBlobFileAndWriter( const Status s = blob_db_impl->CreateBlobFileAndWriter(
@ -165,25 +180,24 @@ bool BlobIndexCompactionFilterGC::OpenNewBlobFileIfNeeded() const {
ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log, ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log,
"Error opening new blob file during GC, status: %s", "Error opening new blob file during GC, status: %s",
s.ToString().c_str()); s.ToString().c_str());
blob_file_.reset();
writer_.reset();
return false; return false;
} }
assert(blob_file_); assert(blob_file_);
assert(writer_); assert(writer_);
gc_stats_.AddNewFile();
return true; return true;
} }
bool BlobIndexCompactionFilterGC::ReadBlobFromOldFile( bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile(
const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob, const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob,
CompressionType* compression_type) const { bool need_decompress, CompressionType* compression_type) const {
BlobDBImpl* const blob_db_impl = context_gc_.blob_db_impl; BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
assert(blob_db_impl); assert(blob_db_impl);
const Status s = blob_db_impl->GetRawBlobFromFile( Status s = blob_db_impl->GetRawBlobFromFile(
key, blob_index.file_number(), blob_index.offset(), blob_index.size(), key, blob_index.file_number(), blob_index.offset(), blob_index.size(),
blob, compression_type); blob, compression_type);
@ -197,12 +211,28 @@ bool BlobIndexCompactionFilterGC::ReadBlobFromOldFile(
return false; return false;
} }
if (need_decompress && *compression_type != kNoCompression) {
s = blob_db_impl->DecompressSlice(*blob, *compression_type, blob);
if (!s.ok()) {
ROCKS_LOG_ERROR(
blob_db_impl->db_options_.info_log,
"Uncompression error during blob read from file: %" PRIu64
" blob_offset: %" PRIu64 " blob_size: %" PRIu64
" key: %s status: '%s'",
blob_index.file_number(), blob_index.offset(), blob_index.size(),
key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
return false;
}
}
return true; return true;
} }
bool BlobIndexCompactionFilterGC::WriteBlobToNewFile( bool BlobIndexCompactionFilterBase::WriteBlobToNewFile(
const Slice& key, const Slice& blob, uint64_t* new_blob_file_number, const Slice& key, const Slice& blob, uint64_t* new_blob_file_number,
uint64_t* new_blob_offset) const { uint64_t* new_blob_offset) const {
TEST_SYNC_POINT("BlobIndexCompactionFilterBase::WriteBlobToNewFile");
assert(new_blob_file_number); assert(new_blob_file_number);
assert(new_blob_offset); assert(new_blob_offset);
@ -215,7 +245,7 @@ bool BlobIndexCompactionFilterGC::WriteBlobToNewFile(
new_blob_offset); new_blob_offset);
if (!s.ok()) { if (!s.ok()) {
const BlobDBImpl* const blob_db_impl = context_gc_.blob_db_impl; const BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
assert(blob_db_impl); assert(blob_db_impl);
ROCKS_LOG_ERROR( ROCKS_LOG_ERROR(
@ -230,7 +260,7 @@ bool BlobIndexCompactionFilterGC::WriteBlobToNewFile(
BlobLogRecord::kHeaderSize + key.size() + blob.size(); BlobLogRecord::kHeaderSize + key.size() + blob.size();
blob_file_->BlobRecordAdded(new_size); blob_file_->BlobRecordAdded(new_size);
BlobDBImpl* const blob_db_impl = context_gc_.blob_db_impl; BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
assert(blob_db_impl); assert(blob_db_impl);
blob_db_impl->total_blob_size_ += new_size; blob_db_impl->total_blob_size_ += new_size;
@ -238,8 +268,9 @@ bool BlobIndexCompactionFilterGC::WriteBlobToNewFile(
return true; return true;
} }
bool BlobIndexCompactionFilterGC::CloseAndRegisterNewBlobFileIfNeeded() const { bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFileIfNeeded()
const BlobDBImpl* const blob_db_impl = context_gc_.blob_db_impl; const {
const BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
assert(blob_db_impl); assert(blob_db_impl);
assert(blob_file_); assert(blob_file_);
@ -250,8 +281,8 @@ bool BlobIndexCompactionFilterGC::CloseAndRegisterNewBlobFileIfNeeded() const {
return CloseAndRegisterNewBlobFile(); return CloseAndRegisterNewBlobFile();
} }
bool BlobIndexCompactionFilterGC::CloseAndRegisterNewBlobFile() const { bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const {
BlobDBImpl* const blob_db_impl = context_gc_.blob_db_impl; BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
assert(blob_db_impl); assert(blob_db_impl);
assert(blob_file_); assert(blob_file_);
@ -268,22 +299,112 @@ bool BlobIndexCompactionFilterGC::CloseAndRegisterNewBlobFile() const {
} }
assert(blob_file_->Immutable()); assert(blob_file_->Immutable());
blob_file_.reset();
if (!s.ok()) { if (!s.ok()) {
ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log, ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log,
"Error closing new blob file %s during GC, status: %s", "Error closing new blob file %s during GC, status: %s",
blob_file_->PathName().c_str(), s.ToString().c_str()); blob_file_->PathName().c_str(), s.ToString().c_str());
}
return false; blob_file_.reset();
return s.ok();
}
CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
const Slice& key, const Slice& existing_value,
std::string* new_value) const {
assert(new_value);
const BlobDBImpl* const blob_db_impl = context().blob_db_impl;
(void)blob_db_impl;
assert(blob_db_impl);
assert(blob_db_impl->bdb_options_.enable_garbage_collection);
BlobIndex blob_index;
const Status s = blob_index.DecodeFrom(existing_value);
if (!s.ok()) {
gc_stats_.SetError();
return BlobDecision::kCorruption;
}
if (blob_index.IsInlined()) {
gc_stats_.AddBlob(blob_index.value().size());
return BlobDecision::kKeep;
}
gc_stats_.AddBlob(blob_index.size());
if (blob_index.HasTTL()) {
return BlobDecision::kKeep;
}
if (blob_index.file_number() >= context_gc_.cutoff_file_number) {
return BlobDecision::kKeep;
}
// Note: each compaction generates its own blob files, which, depending on the
// workload, might result in many small blob files. The total number of files
// is bounded though (determined by the number of compactions and the blob
// file size option).
if (!OpenNewBlobFileIfNeeded()) {
gc_stats_.SetError();
return BlobDecision::kIOError;
} }
PinnableSlice blob;
CompressionType compression_type = kNoCompression;
if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) {
gc_stats_.SetError();
return BlobDecision::kIOError;
}
uint64_t new_blob_file_number = 0;
uint64_t new_blob_offset = 0;
if (!WriteBlobToNewFile(key, blob, &new_blob_file_number, &new_blob_offset)) {
gc_stats_.SetError();
return BlobDecision::kIOError;
}
if (!CloseAndRegisterNewBlobFileIfNeeded()) {
gc_stats_.SetError();
return BlobDecision::kIOError;
}
BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
blob.size(), compression_type);
gc_stats_.AddRelocatedBlob(blob_index.size());
return BlobDecision::kChangeValue;
}
bool BlobIndexCompactionFilterGC::OpenNewBlobFileIfNeeded() const {
if (IsBlobFileOpened()) {
return true; return true;
} }
bool result = BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded();
if (result) {
gc_stats_.AddNewFile();
}
return result;
}
std::unique_ptr<CompactionFilter>
BlobIndexCompactionFilterFactoryBase::CreateUserCompactionFilterFromFactory(
const CompactionFilter::Context& context) const {
std::unique_ptr<CompactionFilter> user_comp_filter_from_factory;
if (user_comp_filter_factory_) {
user_comp_filter_from_factory =
user_comp_filter_factory_->CreateCompactionFilter(context);
}
return user_comp_filter_from_factory;
}
std::unique_ptr<CompactionFilter> std::unique_ptr<CompactionFilter>
BlobIndexCompactionFilterFactory::CreateCompactionFilter( BlobIndexCompactionFilterFactory::CreateCompactionFilter(
const CompactionFilter::Context& /*context*/) { const CompactionFilter::Context& _context) {
assert(env()); assert(env());
int64_t current_time = 0; int64_t current_time = 0;
@ -298,13 +419,17 @@ BlobIndexCompactionFilterFactory::CreateCompactionFilter(
BlobCompactionContext context; BlobCompactionContext context;
blob_db_impl()->GetCompactionContext(&context); blob_db_impl()->GetCompactionContext(&context);
std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
CreateUserCompactionFilterFromFactory(_context);
return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter( return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
std::move(context), current_time, statistics())); std::move(context), user_comp_filter(),
std::move(user_comp_filter_from_factory), current_time, statistics()));
} }
std::unique_ptr<CompactionFilter> std::unique_ptr<CompactionFilter>
BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter( BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter(
const CompactionFilter::Context& /*context*/) { const CompactionFilter::Context& _context) {
assert(env()); assert(env());
int64_t current_time = 0; int64_t current_time = 0;
@ -320,8 +445,12 @@ BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter(
BlobCompactionContextGC context_gc; BlobCompactionContextGC context_gc;
blob_db_impl()->GetCompactionContext(&context, &context_gc); blob_db_impl()->GetCompactionContext(&context, &context_gc);
std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
CreateUserCompactionFilterFromFactory(_context);
return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilterGC( return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilterGC(
std::move(context), std::move(context_gc), current_time, statistics())); std::move(context), std::move(context_gc), user_comp_filter(),
std::move(user_comp_filter_from_factory), current_time, statistics()));
} }
} // namespace blob_db } // namespace blob_db

@ -13,11 +13,13 @@
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "utilities/blob_db/blob_db_gc_stats.h" #include "utilities/blob_db/blob_db_gc_stats.h"
#include "utilities/blob_db/blob_db_impl.h" #include "utilities/blob_db/blob_db_impl.h"
#include "utilities/compaction_filters/layered_compaction_filter_base.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
namespace blob_db { namespace blob_db {
struct BlobCompactionContext { struct BlobCompactionContext {
BlobDBImpl* blob_db_impl = nullptr;
uint64_t next_file_number = 0; uint64_t next_file_number = 0;
std::unordered_set<uint64_t> current_blob_files; std::unordered_set<uint64_t> current_blob_files;
SequenceNumber fifo_eviction_seq = 0; SequenceNumber fifo_eviction_seq = 0;
@ -25,41 +27,59 @@ struct BlobCompactionContext {
}; };
struct BlobCompactionContextGC { struct BlobCompactionContextGC {
BlobDBImpl* blob_db_impl = nullptr;
uint64_t cutoff_file_number = 0; uint64_t cutoff_file_number = 0;
}; };
// Compaction filter that deletes expired blob indexes from the base DB. // Compaction filter that deletes expired blob indexes from the base DB.
// Comes into two varieties, one for the non-GC case and one for the GC case. // Comes into two varieties, one for the non-GC case and one for the GC case.
class BlobIndexCompactionFilterBase : public CompactionFilter { class BlobIndexCompactionFilterBase : public LayeredCompactionFilterBase {
public: public:
BlobIndexCompactionFilterBase(BlobCompactionContext&& context, BlobIndexCompactionFilterBase(
BlobCompactionContext&& _context,
const CompactionFilter* _user_comp_filter,
std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
uint64_t current_time, Statistics* stats) uint64_t current_time, Statistics* stats)
: context_(std::move(context)), : LayeredCompactionFilterBase(_user_comp_filter,
std::move(_user_comp_filter_from_factory)),
context_(std::move(_context)),
current_time_(current_time), current_time_(current_time),
statistics_(stats) {} statistics_(stats) {}
~BlobIndexCompactionFilterBase() override { ~BlobIndexCompactionFilterBase() override;
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_);
}
// Filter expired blob indexes regardless of snapshots. // Filter expired blob indexes regardless of snapshots.
bool IgnoreSnapshots() const override { return true; } bool IgnoreSnapshots() const override { return true; }
Decision FilterV2(int /*level*/, const Slice& key, ValueType value_type, Decision FilterV2(int level, const Slice& key, ValueType value_type,
const Slice& value, std::string* /*new_value*/, const Slice& value, std::string* new_value,
std::string* /*skip_until*/) const override; std::string* skip_until) const override;
protected: protected:
bool IsBlobFileOpened() const;
virtual bool OpenNewBlobFileIfNeeded() const;
bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index,
PinnableSlice* blob, bool need_decompress,
CompressionType* compression_type) const;
bool WriteBlobToNewFile(const Slice& key, const Slice& blob,
uint64_t* new_blob_file_number,
uint64_t* new_blob_offset) const;
bool CloseAndRegisterNewBlobFileIfNeeded() const;
bool CloseAndRegisterNewBlobFile() const;
Statistics* statistics() const { return statistics_; } Statistics* statistics() const { return statistics_; }
const BlobCompactionContext& context() const { return context_; }
private:
Decision HandleValueChange(const Slice& key, std::string* new_value) const;
private: private:
BlobCompactionContext context_; BlobCompactionContext context_;
const uint64_t current_time_; const uint64_t current_time_;
Statistics* statistics_; Statistics* statistics_;
mutable std::shared_ptr<BlobFile> blob_file_;
mutable std::shared_ptr<Writer> writer_;
// It is safe to not using std::atomic since the compaction filter, created // It is safe to not using std::atomic since the compaction filter, created
// from a compaction filter factroy, will not be called from multiple threads. // from a compaction filter factroy, will not be called from multiple threads.
mutable uint64_t expired_count_ = 0; mutable uint64_t expired_count_ = 0;
@ -70,20 +90,28 @@ class BlobIndexCompactionFilterBase : public CompactionFilter {
class BlobIndexCompactionFilter : public BlobIndexCompactionFilterBase { class BlobIndexCompactionFilter : public BlobIndexCompactionFilterBase {
public: public:
BlobIndexCompactionFilter(BlobCompactionContext&& context, BlobIndexCompactionFilter(
BlobCompactionContext&& _context,
const CompactionFilter* _user_comp_filter,
std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
uint64_t current_time, Statistics* stats) uint64_t current_time, Statistics* stats)
: BlobIndexCompactionFilterBase(std::move(context), current_time, stats) { : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
} std::move(_user_comp_filter_from_factory),
current_time, stats) {}
const char* Name() const override { return "BlobIndexCompactionFilter"; } const char* Name() const override { return "BlobIndexCompactionFilter"; }
}; };
class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase { class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase {
public: public:
BlobIndexCompactionFilterGC(BlobCompactionContext&& context, BlobIndexCompactionFilterGC(
BlobCompactionContextGC&& context_gc, BlobCompactionContext&& _context, BlobCompactionContextGC&& context_gc,
const CompactionFilter* _user_comp_filter,
std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
uint64_t current_time, Statistics* stats) uint64_t current_time, Statistics* stats)
: BlobIndexCompactionFilterBase(std::move(context), current_time, stats), : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
std::move(_user_comp_filter_from_factory),
current_time, stats),
context_gc_(std::move(context_gc)) {} context_gc_(std::move(context_gc)) {}
~BlobIndexCompactionFilterGC() override; ~BlobIndexCompactionFilterGC() override;
@ -94,20 +122,10 @@ class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase {
std::string* new_value) const override; std::string* new_value) const override;
private: private:
bool OpenNewBlobFileIfNeeded() const; bool OpenNewBlobFileIfNeeded() const override;
bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index,
PinnableSlice* blob,
CompressionType* compression_type) const;
bool WriteBlobToNewFile(const Slice& key, const Slice& blob,
uint64_t* new_blob_file_number,
uint64_t* new_blob_offset) const;
bool CloseAndRegisterNewBlobFileIfNeeded() const;
bool CloseAndRegisterNewBlobFile() const;
private: private:
BlobCompactionContextGC context_gc_; BlobCompactionContextGC context_gc_;
mutable std::shared_ptr<BlobFile> blob_file_;
mutable std::shared_ptr<Writer> writer_;
mutable BlobDBGarbageCollectionStats gc_stats_; mutable BlobDBGarbageCollectionStats gc_stats_;
}; };
@ -117,50 +135,63 @@ class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase {
class BlobIndexCompactionFilterFactoryBase : public CompactionFilterFactory { class BlobIndexCompactionFilterFactoryBase : public CompactionFilterFactory {
public: public:
BlobIndexCompactionFilterFactoryBase(BlobDBImpl* _blob_db_impl, Env* _env, BlobIndexCompactionFilterFactoryBase(BlobDBImpl* _blob_db_impl, Env* _env,
const ColumnFamilyOptions& _cf_options,
Statistics* _statistics) Statistics* _statistics)
: blob_db_impl_(_blob_db_impl), env_(_env), statistics_(_statistics) {} : blob_db_impl_(_blob_db_impl),
env_(_env),
statistics_(_statistics),
user_comp_filter_(_cf_options.compaction_filter),
user_comp_filter_factory_(_cf_options.compaction_filter_factory) {}
protected: protected:
std::unique_ptr<CompactionFilter> CreateUserCompactionFilterFromFactory(
const CompactionFilter::Context& context) const;
BlobDBImpl* blob_db_impl() const { return blob_db_impl_; } BlobDBImpl* blob_db_impl() const { return blob_db_impl_; }
Env* env() const { return env_; } Env* env() const { return env_; }
Statistics* statistics() const { return statistics_; } Statistics* statistics() const { return statistics_; }
const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
private: private:
BlobDBImpl* blob_db_impl_; BlobDBImpl* blob_db_impl_;
Env* env_; Env* env_;
Statistics* statistics_; Statistics* statistics_;
const CompactionFilter* user_comp_filter_;
std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
}; };
class BlobIndexCompactionFilterFactory class BlobIndexCompactionFilterFactory
: public BlobIndexCompactionFilterFactoryBase { : public BlobIndexCompactionFilterFactoryBase {
public: public:
BlobIndexCompactionFilterFactory(BlobDBImpl* _blob_db_impl, Env* _env, BlobIndexCompactionFilterFactory(BlobDBImpl* _blob_db_impl, Env* _env,
const ColumnFamilyOptions& _cf_options,
Statistics* _statistics) Statistics* _statistics)
: BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _env, _statistics) { : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _env, _cf_options,
} _statistics) {}
const char* Name() const override { const char* Name() const override {
return "BlobIndexCompactionFilterFactory"; return "BlobIndexCompactionFilterFactory";
} }
std::unique_ptr<CompactionFilter> CreateCompactionFilter( std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& /*context*/) override; const CompactionFilter::Context& context) override;
}; };
class BlobIndexCompactionFilterFactoryGC class BlobIndexCompactionFilterFactoryGC
: public BlobIndexCompactionFilterFactoryBase { : public BlobIndexCompactionFilterFactoryBase {
public: public:
BlobIndexCompactionFilterFactoryGC(BlobDBImpl* _blob_db_impl, Env* _env, BlobIndexCompactionFilterFactoryGC(BlobDBImpl* _blob_db_impl, Env* _env,
const ColumnFamilyOptions& _cf_options,
Statistics* _statistics) Statistics* _statistics)
: BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _env, _statistics) { : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _env, _cf_options,
} _statistics) {}
const char* Name() const override { const char* Name() const override {
return "BlobIndexCompactionFilterFactoryGC"; return "BlobIndexCompactionFilterFactoryGC";
} }
std::unique_ptr<CompactionFilter> CreateCompactionFilter( std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& /*context*/) override; const CompactionFilter::Context& context) override;
}; };
} // namespace blob_db } // namespace blob_db

@ -137,11 +137,6 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
return Status::NotSupported("No blob directory in options"); return Status::NotSupported("No blob directory in options");
} }
if (cf_options_.compaction_filter != nullptr ||
cf_options_.compaction_filter_factory != nullptr) {
return Status::NotSupported("Blob DB doesn't support compaction filter.");
}
if (bdb_options_.garbage_collection_cutoff < 0.0 || if (bdb_options_.garbage_collection_cutoff < 0.0 ||
bdb_options_.garbage_collection_cutoff > 1.0) { bdb_options_.garbage_collection_cutoff > 1.0) {
return Status::InvalidArgument( return Status::InvalidArgument(
@ -169,6 +164,12 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
ROCKS_LOG_INFO(db_options_.info_log, "Opening BlobDB..."); ROCKS_LOG_INFO(db_options_.info_log, "Opening BlobDB...");
if ((cf_options_.compaction_filter != nullptr ||
cf_options_.compaction_filter_factory != nullptr)) {
ROCKS_LOG_INFO(db_options_.info_log,
"BlobDB only support compaction filter on non-TTL values.");
}
// Open blob directory. // Open blob directory.
s = env_->CreateDirIfMissing(blob_dir_); s = env_->CreateDirIfMissing(blob_dir_);
if (!s.ok()) { if (!s.ok()) {
@ -194,15 +195,18 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
if (bdb_options_.enable_garbage_collection) { if (bdb_options_.enable_garbage_collection) {
db_options_.listeners.push_back(std::make_shared<BlobDBListenerGC>(this)); db_options_.listeners.push_back(std::make_shared<BlobDBListenerGC>(this));
cf_options_.compaction_filter_factory = cf_options_.compaction_filter_factory =
std::make_shared<BlobIndexCompactionFilterFactoryGC>(this, env_, std::make_shared<BlobIndexCompactionFilterFactoryGC>(
statistics_); this, env_, cf_options_, statistics_);
} else { } else {
db_options_.listeners.push_back(std::make_shared<BlobDBListener>(this)); db_options_.listeners.push_back(std::make_shared<BlobDBListener>(this));
cf_options_.compaction_filter_factory = cf_options_.compaction_filter_factory =
std::make_shared<BlobIndexCompactionFilterFactory>(this, env_, std::make_shared<BlobIndexCompactionFilterFactory>(
statistics_); this, env_, cf_options_, statistics_);
} }
// Reset user compaction filter after building into compaction factory.
cf_options_.compaction_filter = nullptr;
// Open base db. // Open base db.
ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_); ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_);
s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_); s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_);
@ -811,6 +815,7 @@ Status BlobDBImpl::CreateBlobFileAndWriter(
bool has_ttl, const ExpirationRange& expiration_range, bool has_ttl, const ExpirationRange& expiration_range,
const std::string& reason, std::shared_ptr<BlobFile>* blob_file, const std::string& reason, std::shared_ptr<BlobFile>* blob_file,
std::shared_ptr<Writer>* writer) { std::shared_ptr<Writer>* writer) {
TEST_SYNC_POINT("BlobDBImpl::CreateBlobFileAndWriter");
assert(has_ttl == (expiration_range.first || expiration_range.second)); assert(has_ttl == (expiration_range.first || expiration_range.second));
assert(blob_file); assert(blob_file);
assert(writer); assert(writer);
@ -1137,6 +1142,31 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
return *compression_output; return *compression_output;
} }
Status BlobDBImpl::DecompressSlice(const Slice& compressed_value,
CompressionType compression_type,
PinnableSlice* value_output) const {
if (compression_type != kNoCompression) {
BlockContents contents;
auto cfh = static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
{
StopWatch decompression_sw(env_, statistics_,
BLOB_DB_DECOMPRESSION_MICROS);
UncompressionContext context(compression_type);
UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
compression_type);
Status s = UncompressBlockContentsForCompressionType(
info, compressed_value.data(), compressed_value.size(), &contents,
kBlockBasedTableVersionFormat, *(cfh->cfd()->ioptions()));
if (!s.ok()) {
return Status::Corruption("Unable to decompress blob.");
}
}
value_output->PinSelf(contents.data);
}
return Status::OK();
}
Status BlobDBImpl::CompactFiles( Status BlobDBImpl::CompactFiles(
const CompactionOptions& compact_options, const CompactionOptions& compact_options,
const std::vector<std::string>& input_file_names, const int output_level, const std::vector<std::string>& input_file_names, const int output_level,
@ -1165,10 +1195,10 @@ Status BlobDBImpl::CompactFiles(
return s; return s;
} }
void BlobDBImpl::GetCompactionContextCommon( void BlobDBImpl::GetCompactionContextCommon(BlobCompactionContext* context) {
BlobCompactionContext* context) const {
assert(context); assert(context);
context->blob_db_impl = this;
context->next_file_number = next_file_number_.load(); context->next_file_number = next_file_number_.load();
context->current_blob_files.clear(); context->current_blob_files.clear();
for (auto& p : blob_files_) { for (auto& p : blob_files_) {
@ -1193,8 +1223,6 @@ void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context,
ReadLock l(&mutex_); ReadLock l(&mutex_);
GetCompactionContextCommon(context); GetCompactionContextCommon(context);
context_gc->blob_db_impl = this;
if (!live_imm_non_ttl_blob_files_.empty()) { if (!live_imm_non_ttl_blob_files_.empty()) {
auto it = live_imm_non_ttl_blob_files_.begin(); auto it = live_imm_non_ttl_blob_files_.begin();
std::advance(it, bdb_options_.garbage_collection_cutoff * std::advance(it, bdb_options_.garbage_collection_cutoff *
@ -1418,20 +1446,7 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
} }
if (compression_type != kNoCompression) { if (compression_type != kNoCompression) {
BlockContents contents; s = DecompressSlice(*value, compression_type, value);
auto cfh = static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
{
StopWatch decompression_sw(env_, statistics_,
BLOB_DB_DECOMPRESSION_MICROS);
UncompressionContext context(compression_type);
UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
compression_type);
s = UncompressBlockContentsForCompressionType(
info, value->data(), value->size(), &contents,
kBlockBasedTableVersionFormat, *(cfh->cfd()->ioptions()));
}
if (!s.ok()) { if (!s.ok()) {
if (debug_level_ >= 2) { if (debug_level_ >= 2) {
ROCKS_LOG_ERROR( ROCKS_LOG_ERROR(
@ -1442,11 +1457,8 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
blob_index.file_number(), blob_index.offset(), blob_index.size(), blob_index.file_number(), blob_index.offset(), blob_index.size(),
key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str()); key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
} }
return s;
return Status::Corruption("Unable to uncompress blob.");
} }
value->PinSelf(contents.data);
} }
return Status::OK(); return Status::OK();
@ -1706,6 +1718,7 @@ std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
} }
Status BlobDBImpl::CloseBlobFile(std::shared_ptr<BlobFile> bfile) { Status BlobDBImpl::CloseBlobFile(std::shared_ptr<BlobFile> bfile) {
TEST_SYNC_POINT("BlobDBImpl::CloseBlobFile");
assert(bfile); assert(bfile);
assert(!bfile->Immutable()); assert(!bfile->Immutable());
assert(!bfile->Obsolete()); assert(!bfile->Obsolete());

@ -71,6 +71,7 @@ class BlobDBImpl : public BlobDB {
friend class BlobDBIterator; friend class BlobDBIterator;
friend class BlobDBListener; friend class BlobDBListener;
friend class BlobDBListenerGC; friend class BlobDBListenerGC;
friend class BlobIndexCompactionFilterBase;
friend class BlobIndexCompactionFilterGC; friend class BlobIndexCompactionFilterGC;
public: public:
@ -168,7 +169,7 @@ class BlobDBImpl : public BlobDB {
// Common part of the two GetCompactionContext methods below. // Common part of the two GetCompactionContext methods below.
// REQUIRES: read lock on mutex_ // REQUIRES: read lock on mutex_
void GetCompactionContextCommon(BlobCompactionContext* context) const; void GetCompactionContextCommon(BlobCompactionContext* context);
void GetCompactionContext(BlobCompactionContext* context); void GetCompactionContext(BlobCompactionContext* context);
void GetCompactionContext(BlobCompactionContext* context, void GetCompactionContext(BlobCompactionContext* context,
@ -232,6 +233,10 @@ class BlobDBImpl : public BlobDB {
Slice GetCompressedSlice(const Slice& raw, Slice GetCompressedSlice(const Slice& raw,
std::string* compression_output) const; std::string* compression_output) const;
Status DecompressSlice(const Slice& compressed_value,
CompressionType compression_type,
PinnableSlice* value_output) const;
// Close a file by appending a footer, and removes file from open files list. // Close a file by appending a footer, and removes file from open files list.
// REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_ // REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_
// and the blob file's mutex_. If called on a blob file which is visible only // and the blob file's mutex_. If called on a blob file which is visible only

@ -1111,27 +1111,187 @@ TEST_F(BlobDBTest, InlineSmallValues) {
ASSERT_TRUE(ttl_file->HasTTL()); ASSERT_TRUE(ttl_file->HasTTL());
} }
TEST_F(BlobDBTest, CompactionFilterNotSupported) { TEST_F(BlobDBTest, UserCompactionFilter) {
class TestCompactionFilter : public CompactionFilter { class CustomerFilter : public CompactionFilter {
const char *Name() const override { return "TestCompactionFilter"; } public:
bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
std::string *new_value, bool *value_changed) const override {
*value_changed = false;
// changing value size to test value transitions between inlined data
// and stored-in-blob data
if (value.size() % 4 == 1) {
*new_value = value.ToString();
// double size by duplicating value
*new_value += *new_value;
*value_changed = true;
return false;
} else if (value.size() % 3 == 1) {
*new_value = value.ToString();
// trancate value size by half
*new_value = new_value->substr(0, new_value->size() / 2);
*value_changed = true;
return false;
} else if (value.size() % 2 == 1) {
return true;
}
return false;
}
bool IgnoreSnapshots() const override { return true; }
const char *Name() const override { return "CustomerFilter"; }
}; };
class TestCompactionFilterFactory : public CompactionFilterFactory { class CustomerFilterFactory : public CompactionFilterFactory {
const char *Name() const override { return "TestCompactionFilterFactory"; } const char *Name() const override { return "CustomerFilterFactory"; }
std::unique_ptr<CompactionFilter> CreateCompactionFilter( std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context & /*context*/) override { const CompactionFilter::Context & /*context*/) override {
return std::unique_ptr<CompactionFilter>(new TestCompactionFilter()); return std::unique_ptr<CompactionFilter>(new CustomerFilter());
} }
}; };
for (int i = 0; i < 2; i++) {
constexpr size_t kNumPuts = 1 << 10;
// Generate both inlined and blob value
constexpr uint64_t kMinValueSize = 1 << 6;
constexpr uint64_t kMaxValueSize = 1 << 8;
constexpr uint64_t kMinBlobSize = 1 << 7;
static_assert(kMinValueSize < kMinBlobSize, "");
static_assert(kMaxValueSize > kMinBlobSize, "");
BlobDBOptions bdb_options;
bdb_options.min_blob_size = kMinBlobSize;
bdb_options.blob_file_size = kMaxValueSize * 10;
bdb_options.disable_background_tasks = true;
if (Snappy_Supported()) {
bdb_options.compression = CompressionType::kSnappyCompression;
}
// case_num == 0: Test user defined compaction filter
// case_num == 1: Test user defined compaction filter factory
for (int case_num = 0; case_num < 2; case_num++) {
Options options; Options options;
if (i == 0) { if (case_num == 0) {
options.compaction_filter = new TestCompactionFilter(); options.compaction_filter = new CustomerFilter();
} else { } else {
options.compaction_filter_factory.reset( options.compaction_filter_factory.reset(new CustomerFilterFactory());
new TestCompactionFilterFactory()); }
options.disable_auto_compactions = true;
options.env = mock_env_.get();
options.statistics = CreateDBStatistics();
Open(bdb_options, options);
std::map<std::string, std::string> data;
std::map<std::string, std::string> data_after_compact;
Random rnd(301);
uint64_t value_size = kMinValueSize;
int drop_record = 0;
for (size_t i = 0; i < kNumPuts; ++i) {
std::ostringstream oss;
oss << "key" << std::setw(4) << std::setfill('0') << i;
const std::string key(oss.str());
const std::string value(
test::RandomHumanReadableString(&rnd, (int)value_size));
const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
ASSERT_OK(Put(key, value));
ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
data[key] = value;
if (value.length() % 4 == 1) {
data_after_compact[key] = value + value;
} else if (value.length() % 3 == 1) {
data_after_compact[key] = value.substr(0, value.size() / 2);
} else if (value.length() % 2 == 1) {
++drop_record;
} else {
data_after_compact[key] = value;
}
if (++value_size > kMaxValueSize) {
value_size = kMinValueSize;
}
}
// Verify full data set
VerifyDB(data);
// Applying compaction filter for records
ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
// Verify data after compaction, only value with even length left.
VerifyDB(data_after_compact);
ASSERT_EQ(drop_record,
options.statistics->getTickerCount(COMPACTION_KEY_DROP_USER));
delete options.compaction_filter;
Destroy();
}
}
// Test user comapction filter when there is IO error on blob data.
TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
class CustomerFilter : public CompactionFilter {
public:
bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
std::string *new_value, bool *value_changed) const override {
*new_value = value.ToString() + "_new";
*value_changed = true;
return false;
}
bool IgnoreSnapshots() const override { return true; }
const char *Name() const override { return "CustomerFilter"; }
};
constexpr size_t kNumPuts = 100;
constexpr int kValueSize = 100;
BlobDBOptions bdb_options;
bdb_options.min_blob_size = 0;
bdb_options.blob_file_size = kValueSize * 10;
bdb_options.disable_background_tasks = true;
bdb_options.compression = CompressionType::kNoCompression;
std::vector<std::string> io_failure_cases = {
"BlobDBImpl::CreateBlobFileAndWriter",
"BlobIndexCompactionFilterBase::WriteBlobToNewFile",
"BlobDBImpl::CloseBlobFile"};
for (size_t case_num = 0; case_num < io_failure_cases.size(); case_num++) {
Options options;
options.compaction_filter = new CustomerFilter();
options.disable_auto_compactions = true;
options.env = fault_injection_env_.get();
options.statistics = CreateDBStatistics();
Open(bdb_options, options);
std::map<std::string, std::string> data;
Random rnd(301);
for (size_t i = 0; i < kNumPuts; ++i) {
std::ostringstream oss;
oss << "key" << std::setw(4) << std::setfill('0') << i;
const std::string key(oss.str());
const std::string value(
test::RandomHumanReadableString(&rnd, kValueSize));
const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
ASSERT_OK(Put(key, value));
ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
data[key] = value;
} }
ASSERT_TRUE(TryOpen(BlobDBOptions(), options).IsNotSupported());
// Verify full data set
VerifyDB(data);
SyncPoint::GetInstance()->SetCallBack(
io_failure_cases[case_num], [&](void * /*arg*/) {
fault_injection_env_->SetFilesystemActive(false, Status::IOError());
});
SyncPoint::GetInstance()->EnableProcessing();
auto s = blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_TRUE(s.IsIOError());
// Verify full data set after compaction failure
VerifyDB(data);
// Reactivate file system to allow test to close DB.
fault_injection_env_->SetFilesystemActive(true);
SyncPoint::GetInstance()->ClearAllCallBacks();
delete options.compaction_filter; delete options.compaction_filter;
Destroy();
} }
} }
@ -1925,7 +2085,7 @@ TEST_F(BlobDBTest, ShutdownWait) {
{"BlobDBTest.ShutdownWait:3", "BlobDBImpl::EvictExpiredFiles:3"}, {"BlobDBTest.ShutdownWait:3", "BlobDBImpl::EvictExpiredFiles:3"},
}); });
// Force all tasks to be scheduled immediately. // Force all tasks to be scheduled immediately.
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( SyncPoint::GetInstance()->SetCallBack(
"TimeQueue::Add:item.end", [&](void *arg) { "TimeQueue::Add:item.end", [&](void *arg) {
std::chrono::steady_clock::time_point *tp = std::chrono::steady_clock::time_point *tp =
static_cast<std::chrono::steady_clock::time_point *>(arg); static_cast<std::chrono::steady_clock::time_point *>(arg);
@ -1933,7 +2093,7 @@ TEST_F(BlobDBTest, ShutdownWait) {
std::chrono::steady_clock::now() - std::chrono::milliseconds(10000); std::chrono::steady_clock::now() - std::chrono::milliseconds(10000);
}); });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( SyncPoint::GetInstance()->SetCallBack(
"BlobDBImpl::EvictExpiredFiles:cb", [&](void * /*arg*/) { "BlobDBImpl::EvictExpiredFiles:cb", [&](void * /*arg*/) {
// Sleep 3 ms to increase the chance of data race. // Sleep 3 ms to increase the chance of data race.
// We've synced up the code so that EvictExpiredFiles() // We've synced up the code so that EvictExpiredFiles()

@ -27,6 +27,7 @@ class BlobFile {
friend class BlobDBImpl; friend class BlobDBImpl;
friend struct BlobFileComparator; friend struct BlobFileComparator;
friend struct BlobFileComparatorTTL; friend struct BlobFileComparatorTTL;
friend class BlobIndexCompactionFilterBase;
friend class BlobIndexCompactionFilterGC; friend class BlobIndexCompactionFilterGC;
private: private:

@ -0,0 +1,37 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include "rocksdb/compaction_filter.h"
namespace ROCKSDB_NAMESPACE {
// Abstract base class for building layered compation filter on top of
// user compaction filter.
// See BlobIndexCompactionFilter or TtlCompactionFilter for a basic usage.
class LayeredCompactionFilterBase : public CompactionFilter {
public:
LayeredCompactionFilterBase(
const CompactionFilter* _user_comp_filter,
std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory)
: user_comp_filter_(_user_comp_filter),
user_comp_filter_from_factory_(
std::move(_user_comp_filter_from_factory)) {
if (!user_comp_filter_) {
user_comp_filter_ = user_comp_filter_from_factory_.get();
}
}
// Return a pointer to user compaction filter
const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
private:
const CompactionFilter* user_comp_filter_;
std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_;
};
} // namespace ROCKSDB_NAMESPACE

@ -17,6 +17,7 @@
#include "rocksdb/merge_operator.h" #include "rocksdb/merge_operator.h"
#include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/db_ttl.h"
#include "rocksdb/utilities/utility_db.h" #include "rocksdb/utilities/utility_db.h"
#include "utilities/compaction_filters/layered_compaction_filter_base.h"
#ifdef _WIN32 #ifdef _WIN32
// Windows API macro interference // Windows API macro interference
@ -148,23 +149,16 @@ class TtlIterator : public Iterator {
Iterator* iter_; Iterator* iter_;
}; };
class TtlCompactionFilter : public CompactionFilter { class TtlCompactionFilter : public LayeredCompactionFilterBase {
public: public:
TtlCompactionFilter( TtlCompactionFilter(int32_t ttl, Env* env,
int32_t ttl, Env* env, const CompactionFilter* user_comp_filter, const CompactionFilter* _user_comp_filter,
std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory = std::unique_ptr<const CompactionFilter>
nullptr) _user_comp_filter_from_factory = nullptr)
: ttl_(ttl), : LayeredCompactionFilterBase(_user_comp_filter,
env_(env), std::move(_user_comp_filter_from_factory)),
user_comp_filter_(user_comp_filter), ttl_(ttl),
user_comp_filter_from_factory_( env_(env) {}
std::move(user_comp_filter_from_factory)) {
// Unlike the merge operator, compaction filter is necessary for TTL, hence
// this would be called even if user doesn't specify any compaction-filter
if (!user_comp_filter_) {
user_comp_filter_ = user_comp_filter_from_factory_.get();
}
}
virtual bool Filter(int level, const Slice& key, const Slice& old_val, virtual bool Filter(int level, const Slice& key, const Slice& old_val,
std::string* new_val, bool* value_changed) const std::string* new_val, bool* value_changed) const
@ -172,13 +166,13 @@ class TtlCompactionFilter : public CompactionFilter {
if (DBWithTTLImpl::IsStale(old_val, ttl_, env_)) { if (DBWithTTLImpl::IsStale(old_val, ttl_, env_)) {
return true; return true;
} }
if (user_comp_filter_ == nullptr) { if (user_comp_filter() == nullptr) {
return false; return false;
} }
assert(old_val.size() >= DBWithTTLImpl::kTSLength); assert(old_val.size() >= DBWithTTLImpl::kTSLength);
Slice old_val_without_ts(old_val.data(), Slice old_val_without_ts(old_val.data(),
old_val.size() - DBWithTTLImpl::kTSLength); old_val.size() - DBWithTTLImpl::kTSLength);
if (user_comp_filter_->Filter(level, key, old_val_without_ts, new_val, if (user_comp_filter()->Filter(level, key, old_val_without_ts, new_val,
value_changed)) { value_changed)) {
return true; return true;
} }
@ -195,8 +189,6 @@ class TtlCompactionFilter : public CompactionFilter {
private: private:
int32_t ttl_; int32_t ttl_;
Env* env_; Env* env_;
const CompactionFilter* user_comp_filter_;
std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_;
}; };
class TtlCompactionFilterFactory : public CompactionFilterFactory { class TtlCompactionFilterFactory : public CompactionFilterFactory {

Loading…
Cancel
Save