Support ingest_behind for IngestExternalFile

Summary:
First cut for early review; there are few conceptual points to answer and some code structure issues.

For conceptual points -

 - restriction-wise, we're going to disallow ingest_behind if (use_seqno_zero_out=true || disable_auto_compaction=false), the user is responsible to properly open and close DB with required params
 - we wanted to ingest into reserved bottom most level. Should we fail fast if bottom level isn't empty, or should we attempt to ingest if file fits there key-ranges-wise?
 - Modifying AssignLevelForIngestedFile seems the place we we'd handle that.

On code structure - going to refactor GenerateAndAddExternalFile call in the test class to allow passing instance of IngestionOptions, that's just going to incur lots of changes at callsites.
Closes https://github.com/facebook/rocksdb/pull/2144

Differential Revision: D4873732

Pulled By: lightmark

fbshipit-source-id: 81cb698106b68ef8797f564453651d50900e153a
main
Mikhail Antonov 8 years ago committed by Facebook Github Bot
parent 01ab7b528c
commit ba685a472a
  1. 6
      db/column_family.cc
  2. 3
      db/compaction_iterator.cc
  3. 4
      db/compaction_iterator.h
  4. 1
      db/compaction_iterator_test.cc
  5. 6
      db/compaction_picker.cc
  6. 29
      db/compaction_picker_test.cc
  7. 20
      db/compaction_picker_universal.cc
  8. 9
      db/db_impl.cc
  9. 8
      db/db_impl_compaction_flush.cc
  10. 76
      db/external_sst_file_ingestion_job.cc
  11. 11
      db/external_sst_file_ingestion_job.h
  12. 125
      db/external_sst_file_test.cc
  13. 16
      include/rocksdb/db.h
  14. 20
      include/rocksdb/options.h
  15. 1
      options/cf_options.cc
  16. 2
      options/cf_options.h
  17. 6
      options/db_options.cc
  18. 1
      options/db_options.h
  19. 3
      options/options.cc
  20. 2
      options/options_helper.cc
  21. 6
      options/options_helper.h
  22. 3
      options/options_settable_test.cc

@ -172,6 +172,12 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
result.num_levels < 2) { result.num_levels < 2) {
result.num_levels = 2; result.num_levels = 2;
} }
if (result.compaction_style == kCompactionStyleUniversal &&
db_options.allow_ingest_behind && result.num_levels < 3) {
result.num_levels = 3;
}
if (result.max_write_buffer_number < 2) { if (result.max_write_buffer_number < 2) {
result.max_write_buffer_number = 2; result.max_write_buffer_number = 2;
} }

@ -548,7 +548,8 @@ void CompactionIterator::PrepareOutput() {
// This is safe for TransactionDB write-conflict checking since transactions // This is safe for TransactionDB write-conflict checking since transactions
// only care about sequence number larger than any active snapshots. // only care about sequence number larger than any active snapshots.
if (bottommost_level_ && valid_ && ikey_.sequence <= earliest_snapshot_ && if ((compaction_ != nullptr && !compaction_->allow_ingest_behind()) &&
bottommost_level_ && valid_ && ikey_.sequence <= earliest_snapshot_ &&
ikey_.type != kTypeMerge && ikey_.type != kTypeMerge &&
!cmp_->Equal(compaction_->GetLargestUserKey(), ikey_.user_key)) { !cmp_->Equal(compaction_->GetLargestUserKey(), ikey_.user_key)) {
assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);

@ -18,6 +18,7 @@
#include "db/merge_helper.h" #include "db/merge_helper.h"
#include "db/pinned_iterators_manager.h" #include "db/pinned_iterators_manager.h"
#include "db/range_del_aggregator.h" #include "db/range_del_aggregator.h"
#include "options/cf_options.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
namespace rocksdb { namespace rocksdb {
@ -48,6 +49,9 @@ class CompactionIterator {
virtual Slice GetLargestUserKey() const { virtual Slice GetLargestUserKey() const {
return compaction_->GetLargestUserKey(); return compaction_->GetLargestUserKey();
} }
virtual bool allow_ingest_behind() const {
return compaction_->immutable_cf_options()->allow_ingest_behind;
}
protected: protected:
CompactionProxy() = default; CompactionProxy() = default;

@ -156,6 +156,7 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
virtual Slice GetLargestUserKey() const { virtual Slice GetLargestUserKey() const {
return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
} }
virtual bool allow_ingest_behind() const { return false; }
bool key_not_exists_beyond_output_level = false; bool key_not_exists_beyond_output_level = false;
}; };

@ -529,7 +529,11 @@ Compaction* CompactionPicker::CompactRange(
// files together to the last level. // files together to the last level.
assert(vstorage->num_levels() > 1); assert(vstorage->num_levels() > 1);
// DBImpl::CompactRange() set output level to be the last level // DBImpl::CompactRange() set output level to be the last level
assert(output_level == vstorage->num_levels() - 1); if (ioptions_.allow_ingest_behind) {
assert(output_level == vstorage->num_levels() - 2);
} else {
assert(output_level == vstorage->num_levels() - 1);
}
// DBImpl::RunManualCompaction will make full range for universal compaction // DBImpl::RunManualCompaction will make full range for universal compaction
assert(begin == nullptr); assert(begin == nullptr);
assert(end == nullptr); assert(end == nullptr);

@ -402,6 +402,35 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
vstorage_->CompactionScore(0) >= 1); vstorage_->CompactionScore(0) >= 1);
} }
} }
TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
const uint64_t kFileSize = 100000;
NewVersionStorage(1, kCompactionStyleUniversal);
ioptions_.allow_ingest_behind = true;
ioptions_.num_levels = 3;
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
// must return false when there's no files.
ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
false);
NewVersionStorage(3, kCompactionStyleUniversal);
Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(
universal_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
// output level should be the one above the bottom-most
ASSERT_EQ(1, compaction->output_level());
}
// Tests if the files can be trivially moved in multi level // Tests if the files can be trivially moved in multi level
// universal compaction when allow_trivial_move option is set // universal compaction when allow_trivial_move option is set
// In this test as the input files overlaps, they cannot // In this test as the input files overlaps, they cannot

@ -568,6 +568,13 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns(
output_level = sorted_runs[first_index_after].level - 1; output_level = sorted_runs[first_index_after].level - 1;
} }
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind &&
(output_level == vstorage->num_levels() - 1)) {
assert(output_level > 1);
output_level--;
}
std::vector<CompactionInputFiles> inputs(vstorage->num_levels()); std::vector<CompactionInputFiles> inputs(vstorage->num_levels());
for (size_t i = 0; i < inputs.size(); ++i) { for (size_t i = 0; i < inputs.size(); ++i) {
inputs[i].level = start_level + static_cast<int>(i); inputs[i].level = start_level + static_cast<int>(i);
@ -719,13 +726,20 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp(
cf_name.c_str(), file_num_buf); cf_name.c_str(), file_num_buf);
} }
// output files at the bottom most level, unless it's reserved
int output_level = vstorage->num_levels() - 1;
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind) {
assert(output_level > 1);
output_level--;
}
return new Compaction( return new Compaction(
vstorage, ioptions_, mutable_cf_options, std::move(inputs), vstorage, ioptions_, mutable_cf_options, std::move(inputs),
vstorage->num_levels() - 1, output_level, mutable_cf_options.MaxFileSizeForLevel(output_level),
mutable_cf_options.MaxFileSizeForLevel(vstorage->num_levels() - 1),
/* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
GetCompressionType(ioptions_, vstorage, mutable_cf_options, GetCompressionType(ioptions_, vstorage, mutable_cf_options,
vstorage->num_levels() - 1, 1), output_level, 1),
/* grandparents */ {}, /* is manual */ false, score, /* grandparents */ {}, /* is manual */ false, score,
false /* deletion_compaction */, false /* deletion_compaction */,
CompactionReason::kUniversalSizeAmplification); CompactionReason::kUniversalSizeAmplification);

@ -2614,6 +2614,15 @@ Status DBImpl::IngestExternalFile(
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd(); auto cfd = cfh->cfd();
// Ingest should immediately fail if ingest_behind is requested,
// but the DB doesn't support it.
if (ingestion_options.ingest_behind) {
if (!immutable_db_options_.allow_ingest_behind) {
return Status::InvalidArgument(
"Can't ingest_behind file in DB with allow_ingest_behind=false");
}
}
ExternalSstFileIngestionJob ingestion_job(env_, versions_.get(), cfd, ExternalSstFileIngestionJob ingestion_job(env_, versions_.get(), cfd,
immutable_db_options_, env_options_, immutable_db_options_, env_options_,
&snapshots_, ingestion_options); &snapshots_, ingestion_options);

@ -284,10 +284,14 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal && if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
cfd->NumberLevels() > 1) { cfd->NumberLevels() > 1) {
// Always compact all files together. // Always compact all files together.
final_output_level = cfd->NumberLevels() - 1;
// if bottom most level is reserved
if (immutable_db_options_.allow_ingest_behind) {
final_output_level--;
}
s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
cfd->NumberLevels() - 1, options.target_path_id, final_output_level, options.target_path_id,
begin, end, exclusive); begin, end, exclusive);
final_output_level = cfd->NumberLevels() - 1;
} else { } else {
for (int level = 0; level <= max_level_with_files; level++) { for (int level = 0; level <= max_level_with_files; level++) {
int output_level; int output_level;

@ -171,9 +171,13 @@ Status ExternalSstFileIngestionJob::Run() {
for (IngestedFileInfo& f : files_to_ingest_) { for (IngestedFileInfo& f : files_to_ingest_) {
SequenceNumber assigned_seqno = 0; SequenceNumber assigned_seqno = 0;
status = AssignLevelAndSeqnoForIngestedFile( if (ingestion_options_.ingest_behind) {
super_version, force_global_seqno, cfd_->ioptions()->compaction_style, status = CheckLevelForIngestedBehindFile(&f);
&f, &assigned_seqno); } else {
status = AssignLevelAndSeqnoForIngestedFile(
super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
&f, &assigned_seqno);
}
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }
@ -408,9 +412,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
Arena arena; Arena arena;
ReadOptions ro; ReadOptions ro;
ro.total_order_seek = true; ro.total_order_seek = true;
int target_level = 0; int target_level = 0;
auto* vstorage = cfd_->current()->storage_info(); auto* vstorage = cfd_->current()->storage_info();
for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) { for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
if (lvl > 0 && lvl < vstorage->base_level()) { if (lvl > 0 && lvl < vstorage->base_level()) {
continue; continue;
@ -418,20 +422,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
if (vstorage->NumLevelFiles(lvl) > 0) { if (vstorage->NumLevelFiles(lvl) > 0) {
bool overlap_with_level = false; bool overlap_with_level = false;
MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(), status = IngestedFileOverlapWithLevel(sv, file_to_ingest, lvl,
&arena); &overlap_with_level);
RangeDelAggregator range_del_agg(cfd_->internal_comparator(),
{} /* snapshots */);
sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder,
lvl, &range_del_agg);
if (!range_del_agg.IsEmpty()) {
return Status::NotSupported(
"file ingestion with range tombstones is currently unsupported");
}
ScopedArenaIterator level_iter(merge_iter_builder.Finish());
status = IngestedFileOverlapWithIteratorRange(
file_to_ingest, level_iter.get(), &overlap_with_level);
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }
@ -478,6 +470,33 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
return status; return status;
} }
Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
IngestedFileInfo* file_to_ingest) {
auto* vstorage = cfd_->current()->storage_info();
// first check if new files fit in the bottommost level
int bottom_lvl = cfd_->NumberLevels() - 1;
if(!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
return Status::InvalidArgument(
"Can't ingest_behind file as it doesn't fit "
"at the bottommost level!");
}
// second check if despite allow_ingest_behind=true we still have 0 seqnums
// at some upper level
for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
for (auto file : vstorage->LevelFiles(lvl)) {
if (file->smallest_seqno == 0) {
return Status::InvalidArgument(
"Can't ingest_behind file as despite allow_ingest_behind=true "
"there are files with 0 seqno in database at upper levels!");
}
}
}
file_to_ingest->picked_level = bottom_lvl;
return Status::OK();
}
Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
IngestedFileInfo* file_to_ingest, SequenceNumber seqno) { IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
if (file_to_ingest->original_seqno == seqno) { if (file_to_ingest->original_seqno == seqno) {
@ -564,6 +583,27 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
return true; return true;
} }
Status ExternalSstFileIngestionJob::IngestedFileOverlapWithLevel(
SuperVersion* sv, IngestedFileInfo* file_to_ingest, int lvl,
bool* overlap_with_level) {
Arena arena;
ReadOptions ro;
ro.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(),
&arena);
RangeDelAggregator range_del_agg(cfd_->internal_comparator(),
{} /* snapshots */);
sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder,
lvl, &range_del_agg);
if (!range_del_agg.IsEmpty()) {
return Status::NotSupported(
"file ingestion with range tombstones is currently unsupported");
}
ScopedArenaIterator level_iter(merge_iter_builder.Finish());
return IngestedFileOverlapWithIteratorRange(
file_to_ingest, level_iter.get(), overlap_with_level);
}
} // namespace rocksdb } // namespace rocksdb
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE

@ -125,6 +125,12 @@ class ExternalSstFileIngestionJob {
IngestedFileInfo* file_to_ingest, IngestedFileInfo* file_to_ingest,
SequenceNumber* assigned_seqno); SequenceNumber* assigned_seqno);
// File that we want to ingest behind always goes to the lowest level;
// we just check that it fits in the level, that DB allows ingest_behind,
// and that we don't have 0 seqnums at the upper levels.
// REQUIRES: Mutex held
Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
// Set the file global sequence number to `seqno` // Set the file global sequence number to `seqno`
Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest, Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
SequenceNumber seqno); SequenceNumber seqno);
@ -135,6 +141,11 @@ class ExternalSstFileIngestionJob {
const IngestedFileInfo* file_to_ingest, InternalIterator* iter, const IngestedFileInfo* file_to_ingest, InternalIterator* iter,
bool* overlap); bool* overlap);
// Check if `file_to_ingest` key range overlap with level
// REQUIRES: Mutex held
Status IngestedFileOverlapWithLevel(SuperVersion* sv,
IngestedFileInfo* file_to_ingest, int lvl, bool* overlap_with_level);
// Check if `file_to_ingest` can fit in level `level` // Check if `file_to_ingest` can fit in level `level`
// REQUIRES: Mutex held // REQUIRES: Mutex held
bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,

@ -90,6 +90,68 @@ class ExternalSSTFileTest : public DBTestBase {
return s; return s;
} }
Status GenerateAndAddExternalFileIngestBehind(
const Options options, const IngestExternalFileOptions ifo,
std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
bool sort_data = false,
std::map<std::string, std::string>* true_data = nullptr,
ColumnFamilyHandle* cfh = nullptr) {
// Generate a file id if not provided
if (file_id == -1) {
file_id = last_file_id_ + 1;
last_file_id_++;
}
// Sort data if asked to do so
if (sort_data) {
std::sort(data.begin(), data.end(),
[&](const std::pair<std::string, std::string>& e1,
const std::pair<std::string, std::string>& e2) {
return options.comparator->Compare(e1.first, e2.first) < 0;
});
auto uniq_iter = std::unique(
data.begin(), data.end(),
[&](const std::pair<std::string, std::string>& e1,
const std::pair<std::string, std::string>& e2) {
return options.comparator->Compare(e1.first, e2.first) == 0;
});
data.resize(uniq_iter - data.begin());
}
std::string file_path = sst_files_dir_ + ToString(file_id);
SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
Status s = sst_file_writer.Open(file_path);
if (!s.ok()) {
return s;
}
for (auto& entry : data) {
s = sst_file_writer.Add(entry.first, entry.second);
if (!s.ok()) {
sst_file_writer.Finish();
return s;
}
}
s = sst_file_writer.Finish();
if (s.ok()) {
if (cfh) {
s = db_->IngestExternalFile(cfh, {file_path}, ifo);
} else {
s = db_->IngestExternalFile({file_path}, ifo);
}
}
if (s.ok() && true_data) {
for (auto& entry : data) {
(*true_data)[entry.first] = entry.second;
}
}
return s;
}
Status GenerateAndAddExternalFile( Status GenerateAndAddExternalFile(
const Options options, std::vector<std::pair<int, std::string>> data, const Options options, std::vector<std::pair<int, std::string>> data,
int file_id = -1, bool allow_global_seqno = false, bool sort_data = false, int file_id = -1, bool allow_global_seqno = false, bool sort_data = false,
@ -1817,6 +1879,69 @@ TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
db_->ReleaseSnapshot(snap); db_->ReleaseSnapshot(snap);
} }
TEST_F(ExternalSSTFileTest, IngestBehind) {
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleUniversal;
options.num_levels = 3;
options.disable_auto_compactions = false;
DestroyAndReopen(options);
std::vector<std::pair<std::string, std::string>> file_data;
std::map<std::string, std::string> true_data;
// Insert 100 -> 200 into the memtable
for (int i = 100; i <= 200; i++) {
ASSERT_OK(Put(Key(i), "memtable"));
true_data[Key(i)] = "memtable";
}
// Insert 100 -> 200 using IngestExternalFile
file_data.clear();
for (int i = 0; i <= 20; i++) {
file_data.emplace_back(Key(i), "ingest_behind");
}
IngestExternalFileOptions ifo;
ifo.allow_global_seqno = true;
ifo.ingest_behind = true;
// Can't ingest behind since allow_ingest_behind isn't set to true
ASSERT_NOK(GenerateAndAddExternalFileIngestBehind(options, ifo,
file_data, -1, false,
&true_data));
options.allow_ingest_behind = true;
// check that we still can open the DB, as num_levels should be
// sanitized to 3
options.num_levels = 2;
DestroyAndReopen(options);
options.num_levels = 3;
DestroyAndReopen(options);
// Insert 100 -> 200 into the memtable
for (int i = 100; i <= 200; i++) {
ASSERT_OK(Put(Key(i), "memtable"));
true_data[Key(i)] = "memtable";
}
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
// Universal picker should go at second from the bottom level
ASSERT_EQ("0,1", FilesPerLevel());
ASSERT_OK(GenerateAndAddExternalFileIngestBehind(options, ifo,
file_data, -1, false,
&true_data));
ASSERT_EQ("0,1,1", FilesPerLevel());
// this time ingest should fail as the file doesn't fit to the bottom level
ASSERT_NOK(GenerateAndAddExternalFileIngestBehind(options, ifo,
file_data, -1, false,
&true_data));
ASSERT_EQ("0,1,1", FilesPerLevel());
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
// bottom level should be empty
ASSERT_EQ("0,1", FilesPerLevel());
size_t kcnt = 0;
VerifyDBFromMap(true_data, &kcnt, false);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -944,14 +944,22 @@ class DB {
} }
// IngestExternalFile() will load a list of external SST files (1) into the DB // IngestExternalFile() will load a list of external SST files (1) into the DB
// We will try to find the lowest possible level that the file can fit in, and // Two primary modes are supported:
// ingest the file into this level (2). A file that have a key range that // - Duplicate keys in the new files will overwrite exiting keys (default)
// overlap with the memtable key range will require us to Flush the memtable // - Duplicate keys will be skipped (set ingest_behind=true)
// first before ingesting the file. // In the first mode we will try to find the lowest possible level that
// the file can fit in, and ingest the file into this level (2). A file that
// have a key range that overlap with the memtable key range will require us
// to Flush the memtable first before ingesting the file.
// In the second mode we will always ingest in the bottom mode level (see
// docs to IngestExternalFileOptions::ingest_behind).
// //
// (1) External SST files can be created using SstFileWriter // (1) External SST files can be created using SstFileWriter
// (2) We will try to ingest the files to the lowest possible level // (2) We will try to ingest the files to the lowest possible level
// even if the file compression dont match the level compression // even if the file compression dont match the level compression
// (3) If IngestExternalFileOptions->ingest_behind is set to true,
// we always ingest at the bottommost level, which should be reserved
// for this purpose (see DBOPtions::allow_ingest_behind flag).
virtual Status IngestExternalFile( virtual Status IngestExternalFile(
ColumnFamilyHandle* column_family, ColumnFamilyHandle* column_family,
const std::vector<std::string>& external_files, const std::vector<std::string>& external_files,

@ -847,6 +847,18 @@ struct DBOptions {
// //
// Dynamically changeable through SetDBOptions() API. // Dynamically changeable through SetDBOptions() API.
bool avoid_flush_during_shutdown = false; bool avoid_flush_during_shutdown = false;
// Set this option to true during creation of database if you want
// to be able to ingest behind (call IngestExternalFile() skipping keys
// that already exist, rather than overwriting matching keys).
// Setting this option to true will affect 2 things:
// 1) Disable some internal optimizations around SST file compression
// 2) Reserve bottom-most level for ingested files only.
// 3) Note that num_levels should be >= 3 if this option is turned on.
//
// DEFAULT: false
// Immutable.
bool allow_ingest_behind = false;
}; };
// Options to control the behavior of a database (passed to DB::Open) // Options to control the behavior of a database (passed to DB::Open)
@ -1126,6 +1138,14 @@ struct IngestExternalFileOptions {
// If set to false and the file key range overlaps with the memtable key range // If set to false and the file key range overlaps with the memtable key range
// (memtable flush required), IngestExternalFile will fail. // (memtable flush required), IngestExternalFile will fail.
bool allow_blocking_flush = true; bool allow_blocking_flush = true;
// Set to true if you would like duplicate keys in the file being ingested
// to be skipped rather than overwriting existing data under that key.
// Usecase: back-fill of some historical data in the database without
// over-writing existing newer version of data.
// This option could only be used if the DB has been running
// with allow_ingest_behind=true since the dawn of time.
// All files will be ingested at the bottommost level with seqno=0.
bool ingest_behind = false;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -71,6 +71,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options,
num_levels(cf_options.num_levels), num_levels(cf_options.num_levels),
optimize_filters_for_hits(cf_options.optimize_filters_for_hits), optimize_filters_for_hits(cf_options.optimize_filters_for_hits),
force_consistency_checks(cf_options.force_consistency_checks), force_consistency_checks(cf_options.force_consistency_checks),
allow_ingest_behind(db_options.allow_ingest_behind),
listeners(db_options.listeners), listeners(db_options.listeners),
row_cache(db_options.row_cache), row_cache(db_options.row_cache),
max_subcompactions(db_options.max_subcompactions), max_subcompactions(db_options.max_subcompactions),

@ -108,6 +108,8 @@ struct ImmutableCFOptions {
bool force_consistency_checks; bool force_consistency_checks;
bool allow_ingest_behind;
// A vector of EventListeners which call-back functions will be called // A vector of EventListeners which call-back functions will be called
// when specific RocksDB event happens. // when specific RocksDB event happens.
std::vector<std::shared_ptr<EventListener>> listeners; std::vector<std::shared_ptr<EventListener>> listeners;

@ -84,7 +84,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
fail_if_options_file_error(options.fail_if_options_file_error), fail_if_options_file_error(options.fail_if_options_file_error),
dump_malloc_stats(options.dump_malloc_stats), dump_malloc_stats(options.dump_malloc_stats),
avoid_flush_during_recovery(options.avoid_flush_during_recovery) { avoid_flush_during_recovery(options.avoid_flush_during_recovery),
allow_ingest_behind(options.allow_ingest_behind) {
} }
void ImmutableDBOptions::Dump(Logger* log) const { void ImmutableDBOptions::Dump(Logger* log) const {
@ -210,8 +211,11 @@ void ImmutableDBOptions::Dump(Logger* log) const {
ROCKS_LOG_HEADER(log, " Options.wal_filter: %s", ROCKS_LOG_HEADER(log, " Options.wal_filter: %s",
wal_filter ? wal_filter->Name() : "None"); wal_filter ? wal_filter->Name() : "None");
#endif // ROCKDB_LITE #endif // ROCKDB_LITE
ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_recovery: %d", ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_recovery: %d",
avoid_flush_during_recovery); avoid_flush_during_recovery);
ROCKS_LOG_HEADER(log, " Options.allow_ingest_behind: %d",
allow_ingest_behind);
} }
MutableDBOptions::MutableDBOptions() MutableDBOptions::MutableDBOptions()

@ -77,6 +77,7 @@ struct ImmutableDBOptions {
bool fail_if_options_file_error; bool fail_if_options_file_error;
bool dump_malloc_stats; bool dump_malloc_stats;
bool avoid_flush_during_recovery; bool avoid_flush_during_recovery;
bool allow_ingest_behind;
}; };
struct MutableDBOptions { struct MutableDBOptions {

@ -190,7 +190,8 @@ DBOptions::DBOptions(const Options& options)
fail_if_options_file_error(options.fail_if_options_file_error), fail_if_options_file_error(options.fail_if_options_file_error),
dump_malloc_stats(options.dump_malloc_stats), dump_malloc_stats(options.dump_malloc_stats),
avoid_flush_during_recovery(options.avoid_flush_during_recovery), avoid_flush_during_recovery(options.avoid_flush_during_recovery),
avoid_flush_during_shutdown(options.avoid_flush_during_shutdown) { avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
allow_ingest_behind(options.allow_ingest_behind) {
} }
void DBOptions::Dump(Logger* log) const { void DBOptions::Dump(Logger* log) const {

@ -119,6 +119,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
immutable_db_options.avoid_flush_during_recovery; immutable_db_options.avoid_flush_during_recovery;
options.avoid_flush_during_shutdown = options.avoid_flush_during_shutdown =
mutable_db_options.avoid_flush_during_shutdown; mutable_db_options.avoid_flush_during_shutdown;
options.allow_ingest_behind =
immutable_db_options.allow_ingest_behind;
return options; return options;
} }

@ -335,7 +335,11 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{"avoid_flush_during_shutdown", {"avoid_flush_during_shutdown",
{offsetof(struct DBOptions, avoid_flush_during_shutdown), {offsetof(struct DBOptions, avoid_flush_during_shutdown),
OptionType::kBoolean, OptionVerificationType::kNormal, true, OptionType::kBoolean, OptionVerificationType::kNormal, true,
offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}}}; offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}},
{"allow_ingest_behind",
{offsetof(struct DBOptions, allow_ingest_behind),
OptionType::kBoolean, OptionVerificationType::kNormal, false,
offsetof(struct ImmutableDBOptions, allow_ingest_behind)}}};
// offset_of is used to get the offset of a class data member // offset_of is used to get the offset of a class data member
// ex: offset_of(&ColumnFamilyOptions::num_levels) // ex: offset_of(&ColumnFamilyOptions::num_levels)

@ -291,7 +291,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"dump_malloc_stats=false;" "dump_malloc_stats=false;"
"allow_2pc=false;" "allow_2pc=false;"
"avoid_flush_during_recovery=false;" "avoid_flush_during_recovery=false;"
"avoid_flush_during_shutdown=false;", "avoid_flush_during_shutdown=false;"
"allow_ingest_behind=false;",
new_options)); new_options));
ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),

Loading…
Cancel
Save