diff --git a/.gitignore b/.gitignore index a3a70ee31..995046089 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ build_config.mk *.jar *.*jnilib* *.d-e +*.o-* ldb manifest_dump diff --git a/HISTORY.md b/HISTORY.md index 831d3ccb1..ca65e8c18 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ * Column family support ### Public API changes +* Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes ## 2.8.0 (04/04/2014) diff --git a/Makefile b/Makefile index 80b819304..3e62211f4 100644 --- a/Makefile +++ b/Makefile @@ -195,7 +195,7 @@ check: $(PROGRAMS) $(TESTS) $(TOOLS) ldb_tests: all $(PROGRAMS) $(TESTS) $(TOOLS) python tools/ldb_test.py -crash_test: blackbox_crash_test whitebox_crash_test +crash_test: whitebox_crash_test blackbox_crash_test blackbox_crash_test: db_stress python -u tools/db_crashtest.py diff --git a/db/c.cc b/db/c.cc index e3a0a29a0..915a3e80e 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1230,23 +1230,12 @@ void rocksdb_readoptions_set_fill_cache( opt->rep.fill_cache = v; } -void rocksdb_readoptions_set_prefix_seek( - rocksdb_readoptions_t* opt, unsigned char v) { - opt->rep.prefix_seek = v; -} - void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t* opt, const rocksdb_snapshot_t* snap) { opt->rep.snapshot = (snap ? 
snap->rep : nullptr); } -void rocksdb_readoptions_set_prefix( - rocksdb_readoptions_t* opt, const char* key, size_t keylen) { - Slice prefix = Slice(key, keylen); - opt->rep.prefix = &prefix; -} - void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); diff --git a/db/c_test.c b/db/c_test.c index cd9299bec..8ebce9085 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -461,8 +461,6 @@ int main(int argc, char** argv) { rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err); CheckNoError(err); - rocksdb_readoptions_set_prefix_seek(roptions, 1); - rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); CheckCondition(!rocksdb_iter_valid(iter)); diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index b7ec66d96..a8700bbbc 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -180,7 +180,8 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { int parent_index = -1; if (c->inputs_[0].empty()) { Log(options_->info_log, - "ExpandWhileOverlapping() failure because zero input files"); + "[%s] ExpandWhileOverlapping() failure because zero input files", + c->column_family_data()->GetName().c_str()); } if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) || (c->level() != c->output_level() && @@ -275,9 +276,10 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) { if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { Log(options_->info_log, - "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" - "\n", - (unsigned long)level, (unsigned long)(c->inputs_[0].size()), + "[%s] Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu " + "bytes)\n", + c->column_family_data()->GetName().c_str(), (unsigned long)level, + (unsigned long)(c->inputs_[0].size()), (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size, (unsigned long)inputs1_size, (unsigned long)(expanded0.size()), (unsigned long)(expanded1.size()), (unsigned 
long)expanded0_size, @@ -345,7 +347,9 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, c->inputs_[0] = inputs; if (ExpandWhileOverlapping(c) == false) { delete c; - Log(options_->info_log, "Could not compact due to expansion failure.\n"); + Log(options_->info_log, + "[%s] Could not compact due to expansion failure.\n", + version->cfd_->GetName().c_str()); return nullptr; } @@ -515,10 +519,6 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, nextIndex = i; } - //if (i > Version::number_of_files_to_sort_) { - // Log(options_->info_log, "XXX Looking at index %d", i); - //} - // Do not pick this file if its parents at level+1 are being compacted. // Maybe we can avoid redoing this work in SetupOtherInputs int parent_index = -1; @@ -553,19 +553,21 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, if ((version->files_[level].size() < (unsigned int)options_->level0_file_num_compaction_trigger)) { - LogToBuffer(log_buffer, "Universal: nothing to do\n"); + LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", + version->cfd_->GetName().c_str()); return nullptr; } Version::FileSummaryStorage tmp; - LogToBuffer(log_buffer, "Universal: candidate files(%zu): %s\n", - version->files_[level].size(), + LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n", + version->cfd_->GetName().c_str(), version->files_[level].size(), version->LevelFileSummary(&tmp, 0)); // Check for size amplification first. Compaction* c; if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) != nullptr) { - LogToBuffer(log_buffer, "Universal: compacting for size amp\n"); + LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n", + version->cfd_->GetName().c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. 
@@ -573,7 +575,8 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX, log_buffer)) != nullptr) { - LogToBuffer(log_buffer, "Universal: compacting for size ratio\n"); + LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n", + version->cfd_->GetName().c_str()); } else { // Size amplification and file size ratios are within configured limits. // If max read amplification is exceeding configured limits, then force @@ -583,7 +586,8 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, options_->level0_file_num_compaction_trigger; if ((c = PickCompactionUniversalReadAmp( version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { - LogToBuffer(log_buffer, "Universal: compacting for file num\n"); + LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n", + version->cfd_->GetName().c_str()); } } } @@ -671,9 +675,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( candidate_count = 1; break; } - LogToBuffer(log_buffer, - "Universal: file %lu[%d] being compacted, skipping", - (unsigned long)f->number, loop); + LogToBuffer( + log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping", + version->cfd_->GetName().c_str(), (unsigned long)f->number, loop); f = nullptr; } @@ -681,8 +685,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // first candidate to be compacted. uint64_t candidate_size = f != nullptr? f->file_size : 0; if (f != nullptr) { - LogToBuffer(log_buffer, "Universal: Possible candidate file %lu[%d].", - (unsigned long)f->number, loop); + LogToBuffer( + log_buffer, "[%s] Universal: Possible candidate file %lu[%d].", + version->cfd_->GetName().c_str(), (unsigned long)f->number, loop); } // Check if the suceeding files need compaction. 
@@ -733,9 +738,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( int index = file_by_time[i]; FileMetaData* f = version->files_[level][index]; LogToBuffer(log_buffer, - "Universal: Skipping file %lu[%d] with size %lu %d\n", - (unsigned long)f->number, i, (unsigned long)f->file_size, - f->being_compacted); + "[%s] Universal: Skipping file %lu[%d] with size %lu %d\n", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + i, (unsigned long)f->file_size, f->being_compacted); } } } @@ -769,8 +774,10 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( int index = file_by_time[i]; FileMetaData* f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); - LogToBuffer(log_buffer, "Universal: Picking file %lu[%d] with size %lu\n", - (unsigned long)f->number, i, (unsigned long)f->file_size); + LogToBuffer(log_buffer, + "[%s] Universal: Picking file %lu[%d] with size %lu\n", + version->cfd_->GetName().c_str(), (unsigned long)f->number, i, + (unsigned long)f->file_size); } return c; } @@ -806,17 +813,19 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( start_index = loop; // Consider this as the first candidate. 
break; } - LogToBuffer(log_buffer, "Universal: skipping file %lu[%d] compacted %s", - (unsigned long)f->number, loop, - " cannot be a candidate to reduce size amp.\n"); + LogToBuffer(log_buffer, + "[%s] Universal: skipping file %lu[%d] compacted %s", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + loop, " cannot be a candidate to reduce size amp.\n"); f = nullptr; } if (f == nullptr) { return nullptr; // no candidate files } - LogToBuffer(log_buffer, "Universal: First candidate file %lu[%d] %s", - (unsigned long)f->number, start_index, " to reduce size amp.\n"); + LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + start_index, " to reduce size amp.\n"); // keep adding up all the remaining files for (unsigned int loop = start_index; loop < file_by_time.size() - 1; @@ -825,8 +834,8 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( f = version->files_[level][index]; if (f->being_compacted) { LogToBuffer( - log_buffer, "Universal: Possible candidate file %lu[%d] %s.", - (unsigned long)f->number, loop, + log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.", + version->cfd_->GetName().c_str(), (unsigned long)f->number, loop, " is already being compacted. No size amp reduction possible.\n"); return nullptr; } @@ -843,17 +852,18 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( // size amplification = percentage of additional size if (candidate_size * 100 < ratio * earliest_file_size) { - LogToBuffer(log_buffer, - "Universal: size amp not needed. newer-files-total-size %lu " - "earliest-file-size %lu", - (unsigned long)candidate_size, - (unsigned long)earliest_file_size); + LogToBuffer( + log_buffer, + "[%s] Universal: size amp not needed. 
newer-files-total-size %lu " + "earliest-file-size %lu", + version->cfd_->GetName().c_str(), (unsigned long)candidate_size, + (unsigned long)earliest_file_size); return nullptr; } else { LogToBuffer(log_buffer, - "Universal: size amp needed. newer-files-total-size %lu " + "[%s] Universal: size amp needed. newer-files-total-size %lu " "earliest-file-size %lu", - (unsigned long)candidate_size, + version->cfd_->GetName().c_str(), (unsigned long)candidate_size, (unsigned long)earliest_file_size); } assert(start_index >= 0 && start_index < file_by_time.size() - 1); @@ -869,8 +879,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); LogToBuffer(log_buffer, - "Universal: size amp picking file %lu[%d] with size %lu", - (unsigned long)f->number, index, (unsigned long)f->file_size); + "[%s] Universal: size amp picking file %lu[%d] with size %lu", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + index, (unsigned long)f->file_size); } return c; } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 18da2621a..4726e92b9 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -40,7 +40,7 @@ class CorruptionTest { CorruptionTest() { tiny_cache_ = NewLRUCache(100); options_.env = &env_; - dbname_ = test::TmpDir() + "/db_test"; + dbname_ = test::TmpDir() + "/corruption_test"; DestroyDB(dbname_, options_); db_ = nullptr; @@ -127,24 +127,7 @@ class CorruptionTest { ASSERT_GE(max_expected, correct); } - void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { - // Pick file to corrupt - std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); - uint64_t number; - FileType type; - std::string fname; - int picked_number = -1; - for (unsigned int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && - type == filetype && - int(number) > picked_number) { // Pick latest file - fname = dbname_ + 
"/" + filenames[i]; - picked_number = number; - } - } - ASSERT_TRUE(!fname.empty()) << filetype; - + void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) { struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { const char* msg = strerror(errno); @@ -177,6 +160,42 @@ class CorruptionTest { ASSERT_TRUE(s.ok()) << s.ToString(); } + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + int picked_number = -1; + for (unsigned int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type == filetype && + static_cast(number) > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + + CorruptFile(fname, offset, bytes_to_corrupt); + } + + // corrupts exactly one file at level `level`. 
if no file found at level, + // asserts + void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + for (const auto& m : metadata) { + if (m.level == level) { + CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt); + return; + } + } + ASSERT_TRUE(false) << "no file found at level"; + } + + int Property(const std::string& name) { std::string property; int result; @@ -331,19 +350,23 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) { Reopen(&options); DBImpl* dbi = reinterpret_cast(db_); - // Fill levels >= 1 so memtable compaction outputs to level 1 + // Fill levels >= 1 so memtable flush outputs to level 0 for (int level = 1; level < dbi->NumberLevels(); level++) { dbi->Put(WriteOptions(), "", "begin"); dbi->Put(WriteOptions(), "~", "end"); dbi->TEST_FlushMemTable(); } + options.max_mem_compaction_level = 0; + Reopen(&options); + + dbi = reinterpret_cast(db_); Build(10); dbi->TEST_FlushMemTable(); dbi->TEST_WaitForCompact(); ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); - Corrupt(kTableFile, 100, 1); + CorruptTableFileAtLevel(0, 100, 1); Check(9, 9); // Write must eventually fail because of corrupted table diff --git a/db/db_bench.cc b/db/db_bench.cc index ecf40b943..345821596 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -28,11 +28,11 @@ #include "rocksdb/statistics.h" #include "rocksdb/perf_context.h" #include "port/port.h" +#include "port/stack_trace.h" #include "util/crc32c.h" #include "util/histogram.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/stack_trace.h" #include "util/string_util.h" #include "util/statistics.h" #include "util/testutil.h" @@ -1944,7 +1944,6 @@ class Benchmark { void IteratorCreation(ThreadState* thread) { Duration duration(FLAGS_duration, reads_); ReadOptions options(FLAGS_verify_checksum, true); - options.prefix_seek = (FLAGS_prefix_size > 0); while (!duration.Done(1)) { DB* db = SelectDB(thread); 
Iterator* iter = db->NewIterator(options); @@ -1966,7 +1965,6 @@ class Benchmark { int64_t found = 0; ReadOptions options(FLAGS_verify_checksum, true); options.tailing = FLAGS_use_tailing_iterator; - options.prefix_seek = (FLAGS_prefix_size > 0); Iterator* single_iter = nullptr; std::vector multi_iters; @@ -2528,7 +2526,7 @@ class Benchmark { } // namespace rocksdb int main(int argc, char** argv) { - rocksdb::InstallStackTraceHandler(); + rocksdb::port::InstallStackTraceHandler(); google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + " [OPTIONS]..."); google::ParseCommandLineFlags(&argc, &argv, true); diff --git a/db/db_impl.cc b/db/db_impl.cc index 44f18fb48..3aa2a2256 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -33,7 +33,6 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" -#include "db/prefix_filter_iterator.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" #include "db/tailing_iter.h" @@ -1339,12 +1338,12 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, FileMetaData meta; meta.number = versions_->NewFileNumber(); pending_outputs_.insert(meta.number); - Iterator* iter = mem->NewIterator(); + Iterator* iter = mem->NewIterator(ReadOptions(), true); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = mem->GetFirstSequenceNumber(); - Log(options_.info_log, "Level-0 table #%lu: started", - (unsigned long) meta.number); + Log(options_.info_log, "[%s] Level-0 table #%lu: started", + cfd->GetName().c_str(), (unsigned long)meta.number); Status s; { @@ -1357,10 +1356,9 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, mutex_.Lock(); } - Log(options_.info_log, "Level-0 table #%lu: %lu bytes %s", - (unsigned long) meta.number, - (unsigned long) meta.file_size, - s.ToString().c_str()); + Log(options_.info_log, "[%s] Level-0 table #%lu: %lu bytes %s", + 
cfd->GetName().c_str(), (unsigned long)meta.number, + (unsigned long)meta.file_size, s.ToString().c_str()); delete iter; pending_outputs_.erase(meta.number); @@ -1404,15 +1402,14 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, log_buffer->FlushBufferToLog(); std::vector memtables; for (MemTable* m : mems) { - Log(options_.info_log, - "[CF %u] Flushing memtable with next log file: %lu\n", cfd->GetID(), - (unsigned long)m->GetNextLogNumber()); - memtables.push_back(m->NewIterator()); + Log(options_.info_log, "[%s] Flushing memtable with next log file: %lu\n", + cfd->GetName().c_str(), (unsigned long)m->GetNextLogNumber()); + memtables.push_back(m->NewIterator(ReadOptions(), true)); } Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), &memtables[0], memtables.size()); - Log(options_.info_log, "Level-0 flush table #%lu: started", - (unsigned long)meta.number); + Log(options_.info_log, "[%s] Level-0 flush table #%lu: started", + cfd->GetName().c_str(), (unsigned long)meta.number); s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, cfd->table_cache(), iter, &meta, cfd->internal_comparator(), @@ -1420,10 +1417,13 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, GetCompressionFlush(*cfd->options())); LogFlush(options_.info_log); delete iter; - Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", - (unsigned long) meta.number, - (unsigned long) meta.file_size, - s.ToString().c_str()); + Log(options_.info_log, "[%s] Level-0 flush table #%lu: %lu bytes %s", + cfd->GetName().c_str(), (unsigned long)meta.number, + (unsigned long)meta.file_size, s.ToString().c_str()); + + Version::LevelSummaryStorage tmp; + Log(options_.info_log, "[%s] Level summary: %s\n", cfd->GetName().c_str(), + cfd->current()->LevelSummary(&tmp)); if (!options_.disableDataSync) { db_directory_->Fsync(); } @@ -1483,7 +1483,8 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, autovector mems; cfd->imm()->PickMemtablesToFlush(&mems); if 
(mems.empty()) { - LogToBuffer(log_buffer, "Nothing in memstore to flush"); + LogToBuffer(log_buffer, "[%s] Nothing in memtable to flush", + cfd->GetName().c_str()); return Status::OK(); } @@ -1644,7 +1645,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { Status status; if (to_level < level) { - Log(options_.info_log, "Before refitting:\n%s", + Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); VersionEdit edit; @@ -1654,18 +1655,19 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } - Log(options_.info_log, "Apply version edit:\n%s", - edit.DebugString().data()); + Log(options_.info_log, "[%s] Apply version edit:\n%s", + cfd->GetName().c_str(), edit.DebugString().data()); status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); new_superversion = nullptr; - Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data()); + Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), + status.ToString().data()); if (status.ok()) { - Log(options_.info_log, "After refitting:\n%s", - cfd->current()->DebugString().data()); + Log(options_.info_log, "[%s] After refitting:\n%s", + cfd->GetName().c_str(), cfd->current()->DebugString().data()); } } @@ -1752,12 +1754,14 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ++bg_manual_only_; while (bg_compaction_scheduled_ > 0) { Log(options_.info_log, - "Manual compaction waiting for all other scheduled background " - "compactions to finish"); + "[%s] Manual compaction waiting for all other scheduled background " + "compactions to finish", + cfd->GetName().c_str()); bg_cv_.Wait(); } - Log(options_.info_log, "Manual compaction starting"); + 
Log(options_.info_log, "[%s] Manual compaction starting", + cfd->GetName().c_str()); while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { assert(bg_manual_only_ > 0); @@ -1874,8 +1878,9 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, LogToBuffer( log_buffer, "BackgroundCallFlush doing FlushMemTableToOutputFile with column " - "family %u, flush slots available %d", - cfd->GetID(), options_.max_background_flushes - bg_flush_scheduled_); + "family [%s], flush slots available %d", + cfd->GetName().c_str(), + options_.max_background_flushes - bg_flush_scheduled_); flush_status = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, log_buffer); } @@ -1963,8 +1968,6 @@ void DBImpl::BackgroundCallCompaction() { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); { MutexLock l(&mutex_); - // Log(options_.info_log, "XXX BG Thread %llx process new work item", - // pthread_self()); assert(bg_compaction_scheduled_); Status s; if (!shutting_down_.Acquire_Load()) { @@ -2086,16 +2089,15 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, if (!c) { m->done = true; } - LogToBuffer( - log_buffer, - "Manual compaction from level-%d to level-%d from %s .. %s; will stop " - "at %s\n", - m->input_level, m->output_level, - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)"), - ((m->done || manual_end == nullptr) - ? "(end)" - : manual_end->DebugString().c_str())); + LogToBuffer(log_buffer, + "[%s] Manual compaction from level-%d to level-%d from %s .. " + "%s; will stop at %s\n", + m->cfd->GetName().c_str(), m->input_level, m->output_level, + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + ((m->done || manual_end == nullptr) + ? 
"(end)" + : manual_end->DebugString().c_str())); } else { // no need to refcount in iteration since it's always under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -2128,10 +2130,12 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, InstallSuperVersion(c->column_family_data(), deletion_state); Version::LevelSummaryStorage tmp; - LogToBuffer(log_buffer, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), c->level() + 1, - static_cast(f->file_size), - status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); + LogToBuffer(log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), + static_cast(f->number), c->level() + 1, + static_cast(f->file_size), + status.ToString().c_str(), + c->input_version()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; } else { @@ -2235,7 +2239,6 @@ void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) { mutex_.AssertHeld(); for (const auto file_number : compact->allocated_file_numbers) { pending_outputs_.erase(file_number); - // Log(options_.info_log, "XXX releasing unused file num %d", file_number); } } @@ -2334,11 +2337,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, s = iter->status(); delete iter; if (s.ok()) { - Log(options_.info_log, - "Generated table #%lu: %lu keys, %lu bytes", - (unsigned long) output_number, - (unsigned long) current_entries, - (unsigned long) current_bytes); + Log(options_.info_log, "[%s] Generated table #%lu: %lu keys, %lu bytes", + cfd->GetName().c_str(), (unsigned long)output_number, + (unsigned long)current_entries, (unsigned long)current_bytes); } } return s; @@ -2354,15 +2355,16 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact, // This ensures that a concurrent compaction did not erroneously // pick the same files to compact. 
if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { - Log(options_.info_log, "Compaction %d@%d + %d@%d files aborted", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->output_level()); + Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", + compact->compaction->column_family_data()->GetName().c_str(), + compact->compaction->num_input_files(0), compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->output_level()); return Status::Corruption("Compaction input files inconsistent"); } - LogToBuffer(log_buffer, "Compacted %d@%d + %d@%d files => %lld bytes", + LogToBuffer(log_buffer, "[%s] Compacted %d@%d + %d@%d files => %lld bytes", + compact->compaction->column_family_data()->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), @@ -2620,16 +2622,6 @@ Status DBImpl::ProcessKeyValueCompaction( last_sequence_for_key = ikey.sequence; visible_in_snapshot = visible; } -#if 0 - Log(options_.info_log, - " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " - "%d smallest_snapshot: %d level: %d bottommost %d", - ikey.user_key.ToString().c_str(), - (int)ikey.sequence, ikey.type, kTypeValue, drop, - compact->compaction->IsBaseLevelForKey(ikey.user_key), - (int)last_sequence_for_key, (int)earliest_snapshot, - compact->compaction->level(), bottommost_level); -#endif if (!drop) { // We may write a single key (e.g.: for Put/Delete or successful merge). 
@@ -2801,14 +2793,15 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, ColumnFamilyData* cfd = compact->compaction->column_family_data(); LogToBuffer( log_buffer, - "[CF %u] Compacting %d@%d + %d@%d files, score %.2f slots available %d", - cfd->GetID(), compact->compaction->num_input_files(0), + "[%s] Compacting %d@%d + %d@%d files, score %.2f slots available %d", + cfd->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[2345]; compact->compaction->Summary(scratch, sizeof(scratch)); - LogToBuffer(log_buffer, "Compaction start summary: %s\n", scratch); + LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", + cfd->GetName().c_str(), scratch); assert(cfd->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); @@ -2886,8 +2879,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } if (!ParseInternalKey(key, &ikey)) { // log error - Log(options_.info_log, "Failed to parse key: %s", - key.ToString().c_str()); + Log(options_.info_log, "[%s] Failed to parse key: %s", + cfd->GetName().c_str(), key.ToString().c_str()); continue; } else { // If the prefix remains the same, keep buffering @@ -3068,10 +3061,10 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, Version::LevelSummaryStorage tmp; LogToBuffer( log_buffer, - "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " + "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - cfd->current()->LevelSummary(&tmp), + cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / (double)stats.micros, compact->compaction->output_level(), 
stats.files_in_leveln, @@ -3409,10 +3402,10 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, assert(cfd != nullptr); delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); - Log(options_.info_log, "Created column family \"%s\" (ID %u)", + Log(options_.info_log, "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); } else { - Log(options_.info_log, "Creating column family \"%s\" FAILED -- %s", + Log(options_.info_log, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } return s; @@ -3500,12 +3493,6 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, cfd->user_comparator(), iter, snapshot); } - if (options.prefix) { - // use extra wrapper to exclude any keys from the results which - // don't begin with the prefix - iter = new PrefixFilterIterator(iter, *options.prefix, - cfd->options()->prefix_extractor.get()); - } return iter; } @@ -3513,12 +3500,6 @@ Status DBImpl::NewIterators( const ReadOptions& options, const std::vector& column_families, std::vector* iterators) { - - if (options.prefix) { - return Status::NotSupported( - "NewIterators doesn't support ReadOptions::prefix"); - } - iterators->clear(); iterators->reserve(column_families.size()); SequenceNumber latest_snapshot = 0; @@ -3626,10 +3607,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { Status status; // refcounting cfd in iteration bool dead_cfd = false; + autovector superversions_to_free; + autovector logs_to_free; for (auto cfd : *versions_->GetColumnFamilySet()) { cfd->Ref(); // May temporarily unlock and wait. 
- status = MakeRoomForWrite(cfd, my_batch == nullptr); + status = MakeRoomForWrite(cfd, my_batch == nullptr, &superversions_to_free, + &logs_to_free); if (cfd->Unref()) { dead_cfd = true; } @@ -3742,6 +3726,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { writers_.front()->cv.Signal(); } mutex_.Unlock(); + + for (auto& sv : superversions_to_free) { + delete sv; + } + for (auto& log : logs_to_free) { + delete log; + } + PERF_TIMER_STOP(write_pre_and_post_process_time); return status; } @@ -3828,7 +3820,10 @@ uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { +Status DBImpl::MakeRoomForWrite( + ColumnFamilyData* cfd, bool force, + autovector* superversions_to_free, + autovector* logs_to_free) { mutex_.AssertHeld(); assert(!writers_.empty()); bool allow_delay = !force; @@ -3878,7 +3873,8 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { // We have filled up the current memtable, but the previous // ones are still being flushed, so we wait. DelayLoggingAndReset(); - Log(options_.info_log, "wait for memtable flush...\n"); + Log(options_.info_log, "[%s] wait for memtable flush...\n", + cfd->GetName().c_str()); MaybeScheduleFlushOrCompaction(); uint64_t stall; { @@ -3895,7 +3891,8 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { cfd->options()->level0_stop_writes_trigger) { // There are too many level-0 files. 
DelayLoggingAndReset(); - Log(options_.info_log, "wait for fewer level0 files...\n"); + Log(options_.info_log, "[%s] wait for fewer level0 files...\n", + cfd->GetName().c_str()); uint64_t stall; { StopWatch sw(env_, options_.statistics.get(), @@ -3996,8 +3993,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { if (creating_new_log) { logfile_number_ = new_log_number; assert(new_log != nullptr); - // TODO(icanadi) delete outside of mutex - delete log_.release(); + logs_to_free->push_back(log_.release()); log_.reset(new_log); log_empty_ = true; alive_log_files_.push_back(logfile_number_); @@ -4019,13 +4015,12 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { } new_mem->Ref(); cfd->SetMemtable(new_mem); - Log(options_.info_log, - "[CF %" PRIu32 "] New memtable created with log file: #%lu\n", - cfd->GetID(), (unsigned long)logfile_number_); + Log(options_.info_log, "[%s] New memtable created with log file: #%lu\n", + cfd->GetName().c_str(), (unsigned long)logfile_number_); force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); - // TODO(icanadi) delete outside of mutex - delete cfd->InstallSuperVersion(new_superversion, &mutex_); + superversions_to_free->push_back( + cfd->InstallSuperVersion(new_superversion, &mutex_)); } } return s; diff --git a/db/db_impl.h b/db/db_impl.h index 5bc495400..b66d4e558 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -325,7 +325,9 @@ class DBImpl : public DB { // TODO(icanadi) free superversion_to_free and old_log outside of mutex Status MakeRoomForWrite(ColumnFamilyData* cfd, - bool force /* flush even if there is room? */); + bool force /* flush even if there is room? 
*/, + autovector* superversions_to_free, + autovector* logs_to_free); void BuildBatchGroup(Writer** last_writer, autovector* write_batch_group); diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 346692beb..3dcde6c40 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -33,7 +33,6 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); ReadOptions roptions; - roptions.prefix_seek = true; return NewInternalIterator(roptions, cfd, super_version); } diff --git a/db/db_test.cc b/db/db_test.cc index f2c665af3..188cfff3d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -306,7 +306,8 @@ class DBTest { kSkipUniversalCompaction = 2, kSkipMergePut = 4, kSkipPlainTable = 8, - kSkipHashIndex = 16 + kSkipHashIndex = 16, + kSkipNoSeekToLast = 32 }; DBTest() : option_config_(kDefault), @@ -341,6 +342,11 @@ class DBTest { if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) { continue; } + if ((skip_mask & kSkipNoSeekToLast) && + (option_config_ == kHashLinkList || + option_config_ == kHashSkipList)) {; + continue; + } if ((skip_mask & kSkipPlainTable) && (option_config_ == kPlainTableAllBytesPrefix || option_config_ == kPlainTableFirstBytePrefix)) { @@ -862,10 +868,11 @@ class DBTest { void VerifyIterLast(std::string expected_key, int cf = 0) { Iterator* iter; + ReadOptions ro; if (cf == 0) { - iter = db_->NewIterator(ReadOptions()); + iter = db_->NewIterator(ro); } else { - iter = db_->NewIterator(ReadOptions(), handles_[cf]); + iter = db_->NewIterator(ro, handles_[cf]); } iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), expected_key); @@ -1009,12 +1016,28 @@ TEST(DBTest, Empty) { options.write_buffer_size = 100000; // Small write buffer CreateAndReopenWithCF({"pikachu"}, &options); + std::string num; + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("0", num); + ASSERT_OK(Put(1, 
"foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls Put(1, "k1", std::string(100000, 'x')); // Fill memtable + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("2", num); + Put(1, "k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); + ASSERT_EQ("v1", Get(1, "foo")); env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls } while (ChangeOptions()); @@ -1447,7 +1470,7 @@ TEST(DBTest, NonBlockingIteration) { // This test verifies block cache behaviors, which is not used by plain // table format. - } while (ChangeOptions(kSkipPlainTable)); + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast)); } // A delete is skipped for key if KeyMayExist(key) returns False @@ -1891,19 +1914,23 @@ TEST(DBTest, IterSmallAndLargeMix) { TEST(DBTest, IterMultiWithDelete) { do { CreateAndReopenWithCF({"pikachu"}); - ASSERT_OK(Put(1, "a", "va")); - ASSERT_OK(Put(1, "b", "vb")); - ASSERT_OK(Put(1, "c", "vc")); - ASSERT_OK(Delete(1, "b")); - ASSERT_EQ("NOT_FOUND", Get(1, "b")); + ASSERT_OK(Put(1, "ka", "va")); + ASSERT_OK(Put(1, "kb", "vb")); + ASSERT_OK(Put(1, "kc", "vc")); + ASSERT_OK(Delete(1, "kb")); + ASSERT_EQ("NOT_FOUND", Get(1, "kb")); Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->Seek("c"); - ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Seek("kc"); + ASSERT_EQ(IterStatus(iter), "kc->vc"); if (!CurrentOptions().merge_operator) { // TODO: merge operator does not support backward iteration yet - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); + if (kPlainTableAllBytesPrefix != option_config_&& + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + 
kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "ka->va"); + } } delete iter; } while (ChangeOptions()); @@ -1936,7 +1963,7 @@ TEST(DBTest, IterPrevMaxSkip) { ASSERT_OK(Delete(1, "key1")); VerifyIterLast("(invalid)", 1); - } while (ChangeOptions(kSkipMergePut)); + } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); } TEST(DBTest, IterWithSnapshot) { @@ -1961,15 +1988,19 @@ TEST(DBTest, IterWithSnapshot) { ASSERT_EQ(IterStatus(iter), "key5->val5"); if (!CurrentOptions().merge_operator) { // TODO: merge operator does not support backward iteration yet - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "key4->val4"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "key3->val3"); + if (kPlainTableAllBytesPrefix != option_config_&& + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "key4->val4"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "key5->val5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + } iter->Next(); ASSERT_TRUE(!iter->Valid()); } @@ -2225,6 +2256,9 @@ TEST(DBTest, NumImmutableMemTable) { ASSERT_TRUE(dbfull()->GetProperty(handles_[1], "rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); perf_context.Reset(); Get(1, "k1"); ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); @@ -2233,6 +2267,13 @@ TEST(DBTest, NumImmutableMemTable) { ASSERT_TRUE(dbfull()->GetProperty(handles_[1], "rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + 
ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "1"); + perf_context.Reset(); Get(1, "k1"); ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); @@ -2246,6 +2287,12 @@ TEST(DBTest, NumImmutableMemTable) { ASSERT_TRUE(dbfull()->GetProperty(handles_[1], "rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "2"); perf_context.Reset(); Get(1, "k2"); ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); @@ -4374,6 +4421,8 @@ TEST(DBTest, HiddenValuesAreRemoved) { TEST(DBTest, CompactBetweenSnapshots) { do { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}); Random rnd(301); FillLevels("a", "z", 1); @@ -5912,7 +5961,7 @@ TEST(DBTest, GroupCommitTest) { ASSERT_TRUE(!itr->Valid()); delete itr; - } while (ChangeOptions()); + } while (ChangeOptions(kSkipNoSeekToLast)); } namespace { @@ -6281,7 +6330,7 @@ TEST(DBTest, Randomized) { } if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - } while (ChangeOptions(kSkipDeletesFilterFirst)); + } while (ChangeOptions(kSkipDeletesFilterFirst | kSkipNoSeekToLast)); } TEST(DBTest, MultiGetSimple) { @@ -6397,7 +6446,6 @@ void PrefixScanInit(DBTest *dbtest) { } // namespace TEST(DBTest, PrefixScan) { - ReadOptions ro = ReadOptions(); int count; Slice prefix; Slice key; @@ -6418,45 +6466,9 @@ TEST(DBTest, PrefixScan) { options.max_background_compactions = 2; options.create_if_missing = true; options.disable_seek_compaction = true; - // Tricky: options.prefix_extractor will be released by - // NewHashSkipListRepFactory after use. 
options.memtable_factory.reset(NewHashSkipListRepFactory()); - // prefix specified, with blooms: 2 RAND I/Os - // SeekToFirst - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - ro.prefix = &prefix; - iter = db_->NewIterator(ro); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - assert(iter->key().starts_with(prefix)); - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - - // prefix specified, with blooms: 2 RAND I/Os - // Seek - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - ro.prefix = &prefix; - iter = db_->NewIterator(ro); - for (iter->Seek(key); iter->Valid(); iter->Next()) { - assert(iter->key().starts_with(prefix)); - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - - // no prefix specified: 11 RAND I/Os + // 11 RAND I/Os DestroyAndReopen(&options); PrefixScanInit(this); count = 0; @@ -6471,7 +6483,7 @@ TEST(DBTest, PrefixScan) { ASSERT_OK(iter->status()); delete iter; ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 11); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); Close(); delete options.filter_policy; } @@ -6620,7 +6632,6 @@ TEST(DBTest, TailingIteratorDeletes) { TEST(DBTest, TailingIteratorPrefixSeek) { ReadOptions read_options; read_options.tailing = true; - read_options.prefix_seek = true; Options options = CurrentOptions(); options.env = env_; diff --git a/db/dbformat.h b/db/dbformat.h index 27a082284..1c86b127a 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -280,7 +280,7 @@ class IterKey { delete[] key_; } key_ = space_; - buf_size_ = sizeof(buf_size_); + buf_size_ = sizeof(space_); key_size_ = 0; } diff --git a/db/internal_stats.cc b/db/internal_stats.cc index fb5e9b229..e8b22a7f8 100644 --- a/db/internal_stats.cc +++ 
b/db/internal_stats.cc @@ -37,6 +37,10 @@ DBPropertyType GetPropertyType(const Slice& property) { return kBackgroundErrors; } else if (in == "cur-size-active-mem-table") { return kCurSizeActiveMemTable; + } else if (in == "num-entries-active-mem-table") { + return kNumEntriesInMutableMemtable; + } else if (in == "num-entries-imm-mem-tables") { + return kNumEntriesInImmutableMemtable; } return kUnknown; } @@ -349,6 +353,14 @@ bool InternalStats::GetProperty(DBPropertyType property_type, // Current size of the active memtable *value = std::to_string(cfd->mem()->ApproximateMemoryUsage()); return true; + case kNumEntriesInMutableMemtable: + // Current size of the active memtable + *value = std::to_string(cfd->mem()->GetNumEntries()); + return true; + case kNumEntriesInImmutableMemtable: + // Current size of the active memtable + *value = std::to_string(cfd->imm()->current()->GetTotalNumEntries()); + return true; default: return false; } diff --git a/db/internal_stats.h b/db/internal_stats.h index 616b6cc0d..2a743593d 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -28,13 +28,16 @@ enum DBPropertyType { kLevelStats, // Return number of files and total sizes of each level kStats, // Return general statitistics of DB kSsTables, // Return a human readable string of current SST files - kNumImmutableMemTable, // Return number of immutable mem tables - kMemtableFlushPending, // Return 1 if mem table flushing is pending, - // otherwise - // 0. - kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. - kBackgroundErrors, // Return accumulated background errors encountered. + kNumImmutableMemTable, // Return number of immutable mem tables + kMemtableFlushPending, // Return 1 if mem table flushing is pending, + // otherwise 0. + kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. + kBackgroundErrors, // Return accumulated background errors encountered. 
kCurSizeActiveMemTable, // Return current size of the active memtable + kNumEntriesInMutableMemtable, // Return number of entries in the mutable + // memtable. + kNumEntriesInImmutableMemtable, // Return sum of number of entries in all + // the immutable mem tables. kUnknown, }; diff --git a/db/memtable.cc b/db/memtable.cc index 2d12708c3..b13b9f294 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -29,8 +29,7 @@ namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, - const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), @@ -38,6 +37,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, arena_(options.arena_block_size), table_(options.memtable_factory->CreateMemTableRep( comparator_, &arena_, options.prefix_extractor.get())), + num_entries_(0), flush_in_progress_(false), flush_completed_(false), file_number_(0), @@ -159,14 +159,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - MemTableIterator(const MemTable& mem, const ReadOptions& options) + MemTableIterator(const MemTable& mem, const ReadOptions& options, + bool enforce_total_order) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), - iter_(), valid_(false) { - if (options.prefix) { - iter_.reset(mem.table_->GetPrefixIterator(*options.prefix)); - } else if (options.prefix_seek) { + if (prefix_extractor_ != nullptr && !enforce_total_order) { bloom_ = mem.prefix_bloom_.get(); iter_.reset(mem.table_->GetDynamicPrefixIterator()); } else { @@ -217,7 +215,7 @@ class MemTableIterator: public Iterator { private: DynamicBloom* bloom_; const SliceTransform* const prefix_extractor_; - std::shared_ptr iter_; + std::unique_ptr iter_; bool valid_; // No copying allowed @@ -225,8 +223,9 @@ class MemTableIterator: public Iterator { void operator=(const MemTableIterator&); }; 
-Iterator* MemTable::NewIterator(const ReadOptions& options) { - return new MemTableIterator(*this, options); +Iterator* MemTable::NewIterator(const ReadOptions& options, + bool enforce_total_order) { + return new MemTableIterator(*this, options, enforce_total_order); } port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -260,6 +259,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); table_->Insert(handle); + num_entries_++; if (prefix_bloom_) { assert(prefix_extractor_); @@ -477,7 +477,7 @@ bool MemTable::UpdateCallback(SequenceNumber seq, LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); - std::shared_ptr iter( + std::unique_ptr iter( table_->GetIterator(lkey.user_key())); iter->Seek(lkey.internal_key(), memkey.data()); diff --git a/db/memtable.h b/db/memtable.h index 3d392820c..a4700f731 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -75,14 +75,10 @@ class MemTable { // iterator are internal keys encoded by AppendInternalKey in the // db/dbformat.{h,cc} module. // - // If options.prefix is supplied, it is passed to the underlying MemTableRep - // as a hint that the iterator only need to support access to keys with that - // specific prefix. - // If options.prefix is not supplied and options.prefix_seek is set, the - // iterator is not bound to a specific prefix. However, the semantics of - // Seek is changed - the result might only include keys with the same prefix - // as the seek-key. - Iterator* NewIterator(const ReadOptions& options = ReadOptions()); + // By default, it returns an iterator for prefix seek if prefix_extractor + // is configured in Options. + Iterator* NewIterator(const ReadOptions& options, + bool enforce_total_order = false); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. @@ -132,6 +128,9 @@ class MemTable { // key in the memtable. 
size_t CountSuccessiveMergeEntries(const LookupKey& key); + // Get total number of entries in the mem table. + uint64_t GetNumEntries() const { return num_entries_; } + // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } @@ -174,6 +173,8 @@ class MemTable { Arena arena_; unique_ptr table_; + uint64_t num_entries_; + // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush bool flush_completed_; // finished the flush diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 655ded7f1..235421962 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -78,6 +78,14 @@ void MemTableListVersion::AddIterators(const ReadOptions& options, } } +uint64_t MemTableListVersion::GetTotalNumEntries() const { + uint64_t total_num = 0; + for (auto& m : memlist_) { + total_num += m->GetNumEntries(); + } + return total_num; +} + // caller is responsible for referencing m void MemTableListVersion::Add(MemTable* m) { assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable @@ -176,8 +184,8 @@ Status MemTableList::InstallMemtableFlushResults( break; } - LogToBuffer(log_buffer, "Level-0 commit table #%lu started", - (unsigned long)m->file_number_); + LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started", + cfd->GetName().c_str(), (unsigned long)m->file_number_); // this can release and reacquire the mutex. s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory); @@ -191,8 +199,10 @@ Status MemTableList::InstallMemtableFlushResults( uint64_t mem_id = 1; // how many memtables has been flushed. 
do { if (s.ok()) { // commit new state - LogToBuffer(log_buffer, "Level-0 commit table #%lu: memtable #%lu done", - (unsigned long)m->file_number_, (unsigned long)mem_id); + LogToBuffer(log_buffer, + "[%s] Level-0 commit table #%lu: memtable #%lu done", + cfd->GetName().c_str(), (unsigned long)m->file_number_, + (unsigned long)mem_id); current_->Remove(m); assert(m->file_number_ > 0); diff --git a/db/memtable_list.h b/db/memtable_list.h index 903305779..d85380b55 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -49,6 +49,8 @@ class MemTableListVersion { void AddIterators(const ReadOptions& options, std::vector* iterator_list); + uint64_t GetTotalNumEntries() const; + private: // REQUIRE: m is mutable memtable void Add(MemTable* m); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 9f836b76e..4cff95952 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -47,7 +47,6 @@ class PlainTableDBTest { public: PlainTableDBTest() : env_(Env::Default()) { - ro_.prefix_seek = true; dbname_ = test::TmpDir() + "/plain_table_db_test"; ASSERT_OK(DestroyDB(dbname_, Options())); db_ = nullptr; @@ -59,8 +58,6 @@ class PlainTableDBTest { ASSERT_OK(DestroyDB(dbname_, Options())); } - ReadOptions ro_; - // Return the current option configuration. 
Options CurrentOptions() { Options options; @@ -123,7 +120,7 @@ class PlainTableDBTest { } std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { - ReadOptions options = ro_; + ReadOptions options; options.snapshot = snapshot; std::string result; Status s = db_->Get(options, k, &result); @@ -190,7 +187,7 @@ class TestPlainTableReader : public PlainTableReader { file_size, bloom_bits_per_key, hash_table_ratio, index_sparseness, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { - Status s = PopulateIndex(); + Status s = PopulateIndex(const_cast(table_properties)); ASSERT_TRUE(s.ok()); } @@ -265,6 +262,19 @@ TEST(PlainTableDBTest, Flush) { ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); dbfull()->TEST_FlushMemTable(); + + TablePropertiesCollection ptc; + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(1, ptc.size()); + auto row = ptc.begin(); + auto tp = row->second; + ASSERT_EQ( + total_order ? "4" : "12", + (tp->user_collected_properties).at("plain_table_hash_table_size")); + ASSERT_EQ( + total_order ? 
"9" : "0", + (tp->user_collected_properties).at("plain_table_sub_index_size")); + ASSERT_EQ("v3", Get("1000000000000foo")); ASSERT_EQ("v2", Get("0000000000000bar")); } @@ -356,7 +366,7 @@ TEST(PlainTableDBTest, Iterator) { dbfull()->TEST_FlushMemTable(); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); - Iterator* iter = dbfull()->NewIterator(ro_); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek("1000000000foo000"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("1000000000foo001", iter->key().ToString()); @@ -458,7 +468,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) { dbfull()->TEST_FlushMemTable(); - Iterator* iter = dbfull()->NewIterator(ro_); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); for (size_t i = 0; i < 7; i++) { @@ -522,7 +532,7 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { dbfull()->TEST_FlushMemTable(); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); - Iterator* iter = dbfull()->NewIterator(ro_); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek("1000000000foo009"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("1000000000foo008", iter->key().ToString()); @@ -753,7 +763,7 @@ TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { ASSERT_EQ("NOT_FOUND", Get("8000000000000bar")); ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); - Iterator* iter = dbfull()->NewIterator(ro_); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek("5000000000000bar"); ASSERT_TRUE(iter->Valid()); diff --git a/db/prefix_filter_iterator.h b/db/prefix_filter_iterator.h deleted file mode 100644 index e868c7a54..000000000 --- a/db/prefix_filter_iterator.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Wrap an underlying iterator, but exclude any results not starting -// with a given prefix. Seeking to keys not beginning with the prefix -// is invalid, and SeekToLast is not implemented (that would be -// non-trivial), but otherwise this iterator will behave just like the -// underlying iterator would if there happened to be no non-matching -// keys in the dataset. - -#pragma once -#include "rocksdb/iterator.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" - -namespace rocksdb { - -class PrefixFilterIterator : public Iterator { - private: - Iterator* iter_; - const Slice &prefix_; - const SliceTransform *prefix_extractor_; - Status status_; - - public: - PrefixFilterIterator(Iterator* iter, - const Slice &prefix, - const SliceTransform* prefix_extractor) - : iter_(iter), prefix_(prefix), - prefix_extractor_(prefix_extractor), - status_(Status::OK()) { - if (prefix_extractor == nullptr) { - status_ = Status::InvalidArgument("A prefix filter may not be used " - "unless a function is also defined " - "for extracting prefixes"); - } else if (!prefix_extractor_->InRange(prefix)) { - status_ = Status::InvalidArgument("Must provide a slice for prefix which" - "is a prefix for some key"); - } - } - ~PrefixFilterIterator() { - delete iter_; - } - Slice key() const { return iter_->key(); } - Slice value() const { return iter_->value(); } - Status status() const { - if (!status_.ok()) { - return status_; - } - return iter_->status(); - } - void Next() { iter_->Next(); } - void Prev() { iter_->Prev(); } - void Seek(const Slice& k) { - if (prefix_extractor_->Transform(k) == prefix_) { - iter_->Seek(k); - } else { - status_ = Status::InvalidArgument("Seek must begin with target prefix"); - } - } - void SeekToFirst() { - Seek(prefix_); - } - void SeekToLast() { - status_ = Status::NotSupported("SeekToLast is incompatible with prefixes"); - } - bool Valid() 
const { - return (status_.ok() && iter_->Valid() && - prefix_extractor_->Transform(iter_->key()) == prefix_); - } -}; - -} // namespace rocksdb diff --git a/db/prefix_test.cc b/db/prefix_test.cc index c73cf00a6..18036bb93 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -17,7 +17,6 @@ #include "util/stop_watch.h" #include "util/testharness.h" -DEFINE_bool(use_prefix_hash_memtable, true, ""); DEFINE_bool(trigger_deadlock, false, "issue delete in range scan to trigger PrefixHashMap deadlock"); DEFINE_uint64(bucket_count, 100000, "number of buckets"); @@ -208,7 +207,6 @@ TEST(PrefixTest, TestResult) { auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; - read_options.prefix_seek = true; // 1. Insert one row. Slice v16("v16"); @@ -371,43 +369,6 @@ TEST(PrefixTest, TestResult) { } } -TEST(PrefixTest, FullIterator) { - while (NextOptions(1000000)) { - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - WriteOptions write_options; - - std::vector prefixes; - for (uint64_t i = 0; i < 100; ++i) { - prefixes.push_back(i); - } - std::random_shuffle(prefixes.begin(), prefixes.end()); - - for (auto prefix : prefixes) { - for (uint64_t i = 0; i < 200; ++i) { - TestKey test_key(prefix, i); - Slice key = TestKeyToSlice(test_key); - ASSERT_OK(db->Put(write_options, key, Slice("0"))); - } - } - - auto func = [](void* db_void) { - auto db = reinterpret_cast(db_void); - std::unique_ptr iter(db->NewIterator(ReadOptions())); - iter->SeekToFirst(); - for (int i = 0; i < 3; ++i) { - iter->Next(); - } - }; - - auto env = Env::Default(); - for (int i = 0; i < 16; ++i) { - env->StartThread(func, reinterpret_cast(db.get())); - } - env->WaitForJoin(); - } -} - TEST(PrefixTest, DynamicPrefixIterator) { while (NextOptions(FLAGS_bucket_count)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() @@ -452,9 +413,6 @@ TEST(PrefixTest, DynamicPrefixIterator) { HistogramImpl hist_seek_time; HistogramImpl hist_seek_comparison; - if 
(FLAGS_use_prefix_hash_memtable) { - read_options.prefix_seek = true; - } std::unique_ptr iter(db->NewIterator(read_options)); for (auto prefix : prefixes) { @@ -464,14 +422,15 @@ TEST(PrefixTest, DynamicPrefixIterator) { perf_context.Reset(); StopWatchNano timer(Env::Default(), true); + auto key_prefix = options.prefix_extractor->Transform(key); uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { + for (iter->Seek(key); + iter->Valid() && iter->key().starts_with(key_prefix); + iter->Next()) { if (FLAGS_trigger_deadlock) { std::cout << "Behold the deadlock!\n"; db->Delete(write_options, iter->key()); } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; total_keys++; } hist_seek_time.Add(timer.ElapsedNanos()); @@ -509,116 +468,6 @@ TEST(PrefixTest, DynamicPrefixIterator) { } } -TEST(PrefixTest, PrefixHash) { - while (NextOptions(FLAGS_bucket_count)) { - std::cout << "*** Mem table: " << options.memtable_factory->Name() - << std::endl; - DestroyDB(kDbName, Options()); - auto db = OpenDb(); - WriteOptions write_options; - ReadOptions read_options; - - std::vector prefixes; - for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { - prefixes.push_back(i); - } - - if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); - } - - // insert x random prefix, each with y continuous element. 
- HistogramImpl hist_put_time; - HistogramImpl hist_put_comparison; - - for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { - TestKey test_key(prefix, sorted); - - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(sorted); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - ASSERT_OK(db->Put(write_options, key, value)); - hist_put_time.Add(timer.ElapsedNanos()); - hist_put_comparison.Add(perf_context.user_key_comparison_count); - } - } - - std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() - << "Put time: \n" << hist_put_time.ToString(); - - - // test seek existing keys - HistogramImpl hist_seek_time; - HistogramImpl hist_seek_comparison; - - for (auto prefix : prefixes) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); - - Slice key_prefix; - if (FLAGS_use_prefix_hash_memtable) { - key_prefix = options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; - } - std::unique_ptr iter(db->NewIterator(read_options)); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - uint64_t total_keys = 0; - for (iter->Seek(key); iter->Valid(); iter->Next()) { - if (FLAGS_trigger_deadlock) { - std::cout << "Behold the deadlock!\n"; - db->Delete(write_options, iter->key()); - } - auto test_key = SliceToTestKey(iter->key()); - if (test_key->prefix != prefix) break; - total_keys++; - } - hist_seek_time.Add(timer.ElapsedNanos()); - hist_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix); - } - - std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" - << hist_seek_time.ToString(); - - // test non-existing keys - HistogramImpl hist_no_seek_time; - HistogramImpl hist_no_seek_comparison; - - for (auto prefix = FLAGS_total_prefixes; - prefix < 
FLAGS_total_prefixes + 100; - prefix++) { - TestKey test_key(prefix, 0); - Slice key = TestKeyToSlice(test_key); - - if (FLAGS_use_prefix_hash_memtable) { - Slice key_prefix = options.prefix_extractor->Transform(key); - read_options.prefix = &key_prefix; - } - std::unique_ptr iter(db->NewIterator(read_options)); - - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->Seek(key); - hist_no_seek_time.Add(timer.ElapsedNanos()); - hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); - ASSERT_TRUE(!iter->Valid()); - } - - std::cout << "non-existing Seek key comparison: \n" - << hist_no_seek_comparison.ToString() - << "non-existing Seek time: \n" - << hist_no_seek_time.ToString(); - } -} - } int main(int argc, char** argv) { diff --git a/db/repair.cc b/db/repair.cc index c154c04ac..8ae64b219 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -231,7 +231,8 @@ class Repairer { // since ExtractMetaData() will also generate edits. FileMetaData meta; meta.number = next_file_number_++; - Iterator* iter = mem->NewIterator(); + ReadOptions ro; + Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */); status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, iter, &meta, icmp_, 0, 0, kNoCompression); delete iter; diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index a67114663..affa61465 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -83,8 +83,6 @@ public: unique_ptr && file, uint64_t file_size, unique_ptr* table_reader); - bool PrefixMayMatch(const Slice& internal_prefix) override; - Iterator* NewIterator(const ReadOptions&) override; Status Get(const ReadOptions&, const Slice& key, void* arg, @@ -220,10 +218,6 @@ std::shared_ptr SimpleTableReader::GetTableProperties() return rep_->table_properties; } -bool SimpleTableReader::PrefixMayMatch(const Slice& internal_prefix) { - return true; -} - Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) { 
return new SimpleTableIterator(this); } diff --git a/db/table_cache.cc b/db/table_cache.cc index 395951324..2321d035a 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -190,33 +190,6 @@ Status TableCache::GetTableProperties( return s; } -bool TableCache::PrefixMayMatch(const ReadOptions& options, - const InternalKeyComparator& icomparator, - const FileMetaData& file_meta, - const Slice& internal_prefix, bool* table_io) { - bool may_match = true; - auto table_reader = file_meta.table_reader; - Cache::Handle* table_handle = nullptr; - if (table_reader == nullptr) { - // Need to get table handle from file number - Status s = FindTable(storage_options_, icomparator, file_meta.number, - file_meta.file_size, &table_handle, table_io); - if (!s.ok()) { - return may_match; - } - table_reader = GetTableReaderFromHandle(table_handle); - } - - may_match = table_reader->PrefixMayMatch(internal_prefix); - - if (table_handle != nullptr) { - // Need to release handle if it is generated from here. - ReleaseHandle(table_handle); - } - - return may_match; -} - void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } diff --git a/db/table_cache.h b/db/table_cache.h index 97e0f6a27..e8cd7ea2e 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -56,13 +56,6 @@ class TableCache { const Slice&, bool), bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); - // Determine whether the table may contain the specified prefix. 
If - // the table index or blooms are not in memory, this may cause an I/O - bool PrefixMayMatch(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - const Slice& internal_prefix, bool* table_io); - // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); diff --git a/db/tailing_iter.cc b/db/tailing_iter.cc index 41d2b225a..67b59b2c9 100644 --- a/db/tailing_iter.cc +++ b/db/tailing_iter.cc @@ -60,8 +60,8 @@ void TailingIterator::Seek(const Slice& target) { // need to do a seek if 'target' belongs to that interval (i.e. immutable_ is // already at the correct position)! // - // If options.prefix_seek is used and immutable_ is not valid, seek if target - // has a different prefix than prev_key. + // If prefix seek is used and immutable_ is not valid, seek if target has a + // different prefix than prev_key. // // prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to // 'target' -- in this case, prev_key_ is included in the interval, so @@ -70,7 +70,7 @@ void TailingIterator::Seek(const Slice& target) { const Comparator* cmp = cfd_->user_comparator(); if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ || (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) || - (read_options_.prefix_seek && !IsSamePrefix(target))) { + (cfd_->options()->prefix_extractor != nullptr && !IsSamePrefix(target))) { SeekImmutable(target); } diff --git a/db/tailing_iter.h b/db/tailing_iter.h index a66a85bc5..6b9c51375 100644 --- a/db/tailing_iter.h +++ b/db/tailing_iter.h @@ -2,9 +2,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-#ifndef ROCKSDB_LITE #pragma once +#ifndef ROCKSDB_LITE + #include #include "rocksdb/db.h" @@ -79,7 +80,7 @@ class TailingIterator : public Iterator { bool IsCurrentVersion() const; // check if SeekImmutable() is needed due to target having a different prefix - // than prev_key_ (used when options.prefix_seek is set) + // than prev_key_ (used when in prefix seek mode) bool IsSamePrefix(const Slice& target) const; // creates mutable_ and immutable_ iterators and updates version_number_ diff --git a/db/version_set.cc b/db/version_set.cc index 40a096253..b85094d91 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -31,6 +31,7 @@ #include "table/merger.h" #include "table/two_level_iterator.h" #include "table/format.h" +#include "table/plain_table_factory.h" #include "table/meta_blocks.h" #include "util/coding.h" #include "util/logging.h" @@ -217,58 +218,43 @@ class Version::LevelFileNumIterator : public Iterator { mutable EncodedFileMetaData current_value_; }; -static Iterator* GetFileIterator(void* arg, const ReadOptions& options, - const EnvOptions& soptions, - const InternalKeyComparator& icomparator, - const Slice& file_value, bool for_compaction) { - TableCache* cache = reinterpret_cast(arg); - if (file_value.size() != sizeof(EncodedFileMetaData)) { - return NewErrorIterator( - Status::Corruption("FileReader invoked with unexpected value")); - } else { - ReadOptions options_copy; - if (options.prefix) { - // suppress prefix filtering since we have already checked the - // filters once at this point - options_copy = options; - options_copy.prefix = nullptr; - } - - const EncodedFileMetaData* encoded_meta = - reinterpret_cast(file_value.data()); - FileMetaData meta(encoded_meta->number, encoded_meta->file_size); - meta.table_reader = encoded_meta->table_reader; - return cache->NewIterator( - options.prefix ? 
options_copy : options, soptions, icomparator, meta, - nullptr /* don't need reference to table*/, for_compaction); +class Version::LevelFileIteratorState : public TwoLevelIteratorState { + public: + LevelFileIteratorState(TableCache* table_cache, + const ReadOptions& read_options, const EnvOptions& env_options, + const InternalKeyComparator& icomparator, bool for_compaction, + bool prefix_enabled) + : TwoLevelIteratorState(prefix_enabled), + table_cache_(table_cache), read_options_(read_options), + env_options_(env_options), icomparator_(icomparator), + for_compaction_(for_compaction) {} + + Iterator* NewSecondaryIterator(const Slice& meta_handle) override { + if (meta_handle.size() != sizeof(EncodedFileMetaData)) { + return NewErrorIterator( + Status::Corruption("FileReader invoked with unexpected value")); + } else { + const EncodedFileMetaData* encoded_meta = + reinterpret_cast(meta_handle.data()); + FileMetaData meta(encoded_meta->number, encoded_meta->file_size); + meta.table_reader = encoded_meta->table_reader; + return table_cache_->NewIterator(read_options_, env_options_, + icomparator_, meta, nullptr /* don't need reference to table*/, + for_compaction_); + } } -} -bool Version::PrefixMayMatch(const ReadOptions& options, - const EnvOptions& soptions, - const Slice& internal_prefix, - Iterator* level_iter) const { - bool may_match = true; - level_iter->Seek(internal_prefix); - if (!level_iter->Valid()) { - // we're past end of level - may_match = false; - } else if (ExtractUserKey(level_iter->key()).starts_with( - ExtractUserKey(internal_prefix))) { - // TODO(tylerharter): do we need this case? Or are we guaranteed - // key() will always be the biggest value for this SST? 
- may_match = true; - } else { - const EncodedFileMetaData* encoded_meta = - reinterpret_cast( - level_iter->value().data()); - FileMetaData meta(encoded_meta->number, encoded_meta->file_size); - meta.table_reader = encoded_meta->table_reader; - may_match = cfd_->table_cache()->PrefixMayMatch( - options, cfd_->internal_comparator(), meta, internal_prefix, nullptr); - } - return may_match; -} + bool PrefixMayMatch(const Slice& internal_key) override { + return true; + } + + private: + TableCache* table_cache_; + const ReadOptions read_options_; + const EnvOptions& env_options_; + const InternalKeyComparator& icomparator_; + bool for_compaction_; +}; Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { auto table_cache = cfd_->table_cache(); @@ -323,31 +309,13 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { return Status::OK(); } -Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, - const EnvOptions& soptions, - int level) const { - Iterator* level_iter = - new LevelFileNumIterator(cfd_->internal_comparator(), &files_[level]); - if (options.prefix) { - InternalKey internal_prefix(*options.prefix, 0, kTypeValue); - if (!PrefixMayMatch(options, soptions, - internal_prefix.Encode(), level_iter)) { - delete level_iter; - // nothing in this level can match the prefix - return NewEmptyIterator(); - } - } - return NewTwoLevelIterator(level_iter, &GetFileIterator, cfd_->table_cache(), - options, soptions, cfd_->internal_comparator()); -} - -void Version::AddIterators(const ReadOptions& options, +void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, std::vector* iters) { // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { iters->push_back(cfd_->table_cache()->NewIterator( - options, soptions, cfd_->internal_comparator(), *file)); + read_options, soptions, cfd_->internal_comparator(), *file)); } // For levels > 0, 
we can use a concatenating iterator that sequentially @@ -355,7 +323,11 @@ void Version::AddIterators(const ReadOptions& options, // lazily. for (int level = 1; level < num_levels_; level++) { if (!files_[level].empty()) { - iters->push_back(NewConcatenatingIterator(options, soptions, level)); + iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState( + cfd_->table_cache(), read_options, soptions, + cfd_->internal_comparator(), false /* for_compaction */, + cfd_->options()->prefix_extractor != nullptr), + new LevelFileNumIterator(cfd_->internal_comparator(), &files_[level]))); } } } @@ -767,16 +739,11 @@ void Version::ComputeCompactionScore( // If we are slowing down writes, then we better compact that first if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { score = 1000000; - // Log(options_->info_log, "XXX score l0 = 1000000000 max"); } else if (numfiles >= cfd_->options()->level0_slowdown_writes_trigger) { score = 10000; - // Log(options_->info_log, "XXX score l0 = 1000000 medium"); } else { score = static_cast(numfiles) / cfd_->options()->level0_file_num_compaction_trigger; - if (score >= 1) { - // Log(options_->info_log, "XXX score l0 = %d least", (int)score); - } } } else { // Compute the ratio of current size to size limit. 
@@ -784,9 +751,6 @@ void Version::ComputeCompactionScore( TotalFileSize(files_[level]) - size_being_compacted[level]; score = static_cast(level_bytes) / cfd_->compaction_picker()->MaxBytesForLevel(level); - if (score > 1) { - // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); - } if (max_score < score) { max_score = score; max_score_level = level; @@ -1823,8 +1787,9 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, manifest_file_size_ = new_manifest_file_size; prev_log_number_ = edit->prev_log_number_; } else { - Log(options_->info_log, "Error in committing version %lu", - (unsigned long)v->GetVersionNumber()); + Log(options_->info_log, "Error in committing version %lu to [%s]", + (unsigned long)v->GetVersionNumber(), + column_family_data->GetName().c_str()); delete v; if (new_descriptor_log) { descriptor_log_.reset(); @@ -1916,7 +1881,7 @@ Status VersionSet::Recover( return Status::Corruption("CURRENT file corrupted"); } - Log(options_->info_log, "Recovering from manifest file:%s\n", + Log(options_->info_log, "Recovering from manifest file: %s\n", manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; @@ -2162,8 +2127,8 @@ Status VersionSet::Recover( for (auto cfd : *column_family_set_) { Log(options_->info_log, - "Column family \"%s\", log number is %" PRIu64 "\n", - cfd->GetName().c_str(), cfd->GetLogNumber()); + "Column family [%s] (ID %u), log number is %" PRIu64 "\n", + cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } } @@ -2663,10 +2628,11 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } Iterator* VersionSet::MakeInputIterator(Compaction* c) { - ReadOptions options; - options.verify_checksums = - c->column_family_data()->options()->verify_checksums_in_compaction; - options.fill_cache = false; + auto cfd = c->column_family_data(); + ReadOptions read_options; + read_options.verify_checksums = + cfd->options()->verify_checksums_in_compaction; + read_options.fill_cache 
= false; // Level-0 files have to be merged together. For other levels, // we will make a concatenating iterator per level. @@ -2678,20 +2644,19 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { if (!c->inputs(which)->empty()) { if (c->level() + which == 0) { for (const auto& file : *c->inputs(which)) { - list[num++] = c->column_family_data()->table_cache()->NewIterator( - options, storage_options_compactions_, - c->column_family_data()->internal_comparator(), *file, nullptr, + list[num++] = cfd->table_cache()->NewIterator( + read_options, storage_options_compactions_, + cfd->internal_comparator(), *file, nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator( - c->column_family_data()->internal_comparator(), - c->inputs(which)), - &GetFileIterator, c->column_family_data()->table_cache(), options, - storage_options_, c->column_family_data()->internal_comparator(), - true /* for compaction */); + list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState( + cfd->table_cache(), read_options, storage_options_, + cfd->internal_comparator(), true /* for_compaction */, + false /* prefix enabled */), + new Version::LevelFileNumIterator(cfd->internal_comparator(), + c->inputs(which))); } } } @@ -2708,7 +2673,9 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG Version* version = c->column_family_data()->current(); if (c->input_version() != version) { - Log(options_->info_log, "VerifyCompactionFileConsistency version mismatch"); + Log(options_->info_log, + "[%s] VerifyCompactionFileConsistency version mismatch", + c->column_family_data()->GetName().c_str()); } // verify files in level diff --git a/db/version_set.h b/db/version_set.h index 8076e6bc6..c8297f8ec 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -219,11 +219,10 @@ class Version { friend class UniversalCompactionPicker; class 
LevelFileNumIterator; - Iterator* NewConcatenatingIterator(const ReadOptions&, - const EnvOptions& soptions, - int level) const; - bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, - const Slice& internal_prefix, Iterator* level_iter) const; + struct LevelFileIteratorState; + + bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter, + const Slice& internal_prefix) const; // Sort all files for this version based on their file size and // record results in files_by_size_. The largest files are listed first. diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index c2f412c59..febd35c05 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -31,7 +31,7 @@ static std::string PrintContents(WriteBatch* b) { ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - Iterator* iter = mem->NewIterator(); + Iterator* iter = mem->NewIterator(ReadOptions()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; memset((void *)&ikey, 0, sizeof(ikey)); @@ -283,7 +283,7 @@ TEST(WriteBatchTest, PutGatherSlices) { namespace { class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { public: - ColumnFamilyHandleImplDummy(int id) + explicit ColumnFamilyHandleImplDummy(int id) : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} uint32_t GetID() const override { return id_; } diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 7d4a374d9..7f2c082d0 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -463,13 +463,9 @@ extern void rocksdb_readoptions_set_verify_checksums( unsigned char); extern void rocksdb_readoptions_set_fill_cache( rocksdb_readoptions_t*, unsigned char); -extern void rocksdb_readoptions_set_prefix_seek( - rocksdb_readoptions_t*, unsigned char); extern void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); -extern void 
rocksdb_readoptions_set_prefix( - rocksdb_readoptions_t*, const char* key, size_t keylen); extern void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern void rocksdb_readoptions_set_tailing( diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 0d251a9a6..d23f41b62 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -148,13 +148,6 @@ class MemTableRep { // GetIterator(). virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); } - // Return an iterator over at least the keys with the specified prefix. The - // iterator may also allow access to other keys, but doesn't have to. Default: - // GetIterator(). - virtual Iterator* GetPrefixIterator(const Slice& prefix) { - return GetIterator(); - } - // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 7f1bf39a9..e95fb557e 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -817,7 +817,10 @@ struct ReadOptions { // If this option is set and memtable implementation allows, Seek // might only return keys with the same prefix as the seek-key - bool prefix_seek; + // + // ! DEPRECATED: prefix_seek is on by default when prefix_extractor + // is configured + // bool prefix_seek; // If "snapshot" is non-nullptr, read as of the supplied snapshot // (which must belong to the DB that is being read and which must @@ -837,7 +840,9 @@ struct ReadOptions { // prefix, and SeekToLast() is not supported. prefix filter with this // option will sometimes reduce the number of read IOPs. // Default: nullptr - const Slice* prefix; + // + // ! DEPRECATED + // const Slice* prefix; // Specify if this read request should process data that ALREADY // resides on a particular cache. 
If the required data is not @@ -856,17 +861,13 @@ struct ReadOptions { ReadOptions() : verify_checksums(true), fill_cache(true), - prefix_seek(false), snapshot(nullptr), - prefix(nullptr), read_tier(kReadAllTier), tailing(false) {} ReadOptions(bool cksum, bool cache) : verify_checksums(cksum), fill_cache(cache), - prefix_seek(false), snapshot(nullptr), - prefix(nullptr), read_tier(kReadAllTier), tailing(false) {} }; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index b50007a32..53bf18e87 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -62,9 +62,7 @@ struct BlockBasedTableOptions { kBinarySearch, // The hash index, if enabled, will do the hash lookup when - // `ReadOption.prefix_seek == true`. User should also specify - // `Options.prefix_extractor` to allow the index block to correctly - // extract the prefix of the given key and perform hash table lookup. + // `Options.prefix_extractor` is provided. kHashSearch, }; diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 55b83f441..aa8b8a0b8 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -23,7 +23,7 @@ namespace rocksdb { // ++pos) { // ... // } -typedef std::map UserCollectedProperties; +typedef std::map UserCollectedProperties; // TableProperties contains a bunch of read-only properties of its associated // table. 
diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h index 80f82154d..7c34e08e1 100644 --- a/include/utilities/backupable_db.h +++ b/include/utilities/backupable_db.h @@ -117,6 +117,29 @@ struct BackupInfo { : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} }; +class BackupEngineReadOnly { + public: + virtual ~BackupEngineReadOnly() {} + + static BackupEngineReadOnly* NewReadOnlyBackupEngine( + Env* db_env, const BackupableDBOptions& options); + + // You can GetBackupInfo safely, even with other BackupEngine performing + // backups on the same directory + virtual void GetBackupInfo(std::vector* backup_info) = 0; + + // Restoring DB from backup is NOT safe when there is another BackupEngine + // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's + // responsibility to synchronize the operation, i.e. don't delete the backup + // when you're restoring from it + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; +}; + // Please see the documentation in BackupableDB and RestoreBackupableDB class BackupEngine { public: diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index f8c1c6038..2e27e9377 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -248,6 +248,5 @@ public class RocksDBSample { // be sure to dispose c++ pointers options.dispose(); readOptions.dispose(); - filter.dispose(); } } diff --git a/java/org/rocksdb/Filter.java b/java/org/rocksdb/Filter.java index 0de392ac6..d16dedc69 100644 --- a/java/org/rocksdb/Filter.java +++ b/java/org/rocksdb/Filter.java @@ -20,7 +20,7 @@ public abstract class Filter { /** * Deletes underlying C++ filter pointer. 
*/ - public synchronized void dispose() { + protected synchronized void dispose() { if(nativeHandle_ != 0) { dispose0(nativeHandle_); } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index ff289b776..cfb3c4a3f 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -146,15 +146,21 @@ public class Options { /** * Use the specified filter policy to reduce disk reads. + * + * Note that the caller should not dispose the input filter as + * Options.dispose() will dispose this filter. + * * @param Filter policy java instance. * @return the instance of the current Options. * @see RocksDB.open() */ public Options setFilter(Filter filter) { assert(isInitialized()); - setFilter0(nativeHandle_, filter); + setFilterHandle(nativeHandle_, filter.nativeHandle_); + filter_ = filter; return this; } + private native void setFilterHandle(long optHandle, long filterHandle); /* * Disable compaction triggered by seek. @@ -786,7 +792,8 @@ public class Options { long handle, int limit); /** - * The following two fields affect how archived logs will be deleted. + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. * 1. If both set to 0, logs will be deleted asap and will not get into * the archive. * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, @@ -800,6 +807,7 @@ public class Options { * checks will be performed with ttl being first. * * @return the wal-ttl seconds + * @see walSizeLimitMB() */ public long walTtlSeconds() { assert(isInitialized()); @@ -808,7 +816,8 @@ public class Options { private native long walTtlSeconds(long handle); /** - * The following two fields affect how archived logs will be deleted. + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. * 1. If both set to 0, logs will be deleted asap and will not get into * the archive. * 2. 
If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, @@ -823,13 +832,64 @@ public class Options { * * @param walTtlSeconds the ttl seconds * @return the reference to the current option. + * @see setWalSizeLimitMB() + */ + public Options setWalTtlSeconds(long walTtlSeconds) { + assert(isInitialized()); + setWalTtlSeconds(nativeHandle_, walTtlSeconds); + return this; + } + private native void setWalTtlSeconds(long handle, long walTtlSeconds); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + * 1. If both set to 0, logs will be deleted asap and will not get into + * the archive. + * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * then WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted. + * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_secondsi / 2 and those that + * are older than WAL_ttl_seconds will be deleted. + * 4. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first. + * + * @return size limit in mega-bytes. + * @see walSizeLimitMB() + */ + public long walSizeLimitMB() { + assert(isInitialized()); + return walSizeLimitMB(nativeHandle_); + } + private native long walSizeLimitMB(long handle); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + * 1. If both set to 0, logs will be deleted asap and will not get into + * the archive. + * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * then WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted. + * 3. 
If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_secondsi / 2 and those that + * are older than WAL_ttl_seconds will be deleted. + * 4. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first. + * + * @param sizeLimitMB size limit in mega-bytes. + * @return the reference to the current option. + * @see setWalSizeLimitMB() */ - public Options setWALTtlSeconds(long walTtlSeconds) { + public Options setWalSizeLimitMB(long sizeLimitMB) { assert(isInitialized()); - setWALTtlSeconds(nativeHandle_, walTtlSeconds); + setWalSizeLimitMB(nativeHandle_, sizeLimitMB); return this; } - private native void setWALTtlSeconds(long handle, long walTtlSeconds); + private native void setWalSizeLimitMB(long handle, long sizeLimitMB); /** * Number of bytes to preallocate (via fallocate) the manifest @@ -1199,6 +1259,1054 @@ public class Options { return this; } +/////////////////////////////////////////////////////////////////////// + /** + * Number of keys between restart points for delta encoding of keys. + * This parameter can be changed dynamically. Most clients should + * leave this parameter alone. + * Default: 16 + * + * @return the number of keys between restart points. + */ + public int blockRestartInterval() { + return blockRestartInterval(nativeHandle_); + } + private native int blockRestartInterval(long handle); + + /** + * Number of keys between restart points for delta encoding of keys. + * This parameter can be changed dynamically. Most clients should + * leave this parameter alone. + * Default: 16 + * + * @param blockRestartInterval the number of keys between restart points. + * @return the reference to the current option. 
+ */ + public Options setBlockRestartInterval(int blockRestartInterval) { + setBlockRestartInterval(nativeHandle_, blockRestartInterval); + return this; + } + private native void setBlockRestartInterval( + long handle, int blockRestartInterval); + + /** + * If true, place whole keys in the filter (not just prefixes). + * This must generally be true for gets to be efficient. + * Default: true + * + * @return if true, then whole-key-filtering is on. + */ + public boolean wholeKeyFiltering() { + return wholeKeyFiltering(nativeHandle_); + } + private native boolean wholeKeyFiltering(long handle); + + /** + * If true, place whole keys in the filter (not just prefixes). + * This must generally be true for gets to be efficient. + * Default: true + * + * @param wholeKeyFiltering if true, then whole-key-filtering is on. + * @return the reference to the current option. + */ + public Options setWholeKeyFiltering(boolean wholeKeyFiltering) { + setWholeKeyFiltering(nativeHandle_, wholeKeyFiltering); + return this; + } + private native void setWholeKeyFiltering( + long handle, boolean wholeKeyFiltering); + + /** + * If level-styled compaction is used, then this number determines + * the total number of levels. + * + * @return the number of levels. + */ + public int numLevels() { + return numLevels(nativeHandle_); + } + private native int numLevels(long handle); + + /** + * Set the number of levels for this database + * If level-styled compaction is used, then this number determines + * the total number of levels. + * + * @param numLevels the number of levels. + * @return the reference to the current option. + */ + public Options setNumLevels(int numLevels) { + setNumLevels(nativeHandle_, numLevels); + return this; + } + private native void setNumLevels( + long handle, int numLevels); + + /** + * The number of files in leve 0 to trigger compaction from level-0 to + * level-1. A value < 0 means that level-0 compaction will not be + * triggered by number of files at all. 
+ * Default: 4 + * + * @return the number of files in level 0 to trigger compaction. + */ + public int levelZeroFileNumCompactionTrigger() { + return levelZeroFileNumCompactionTrigger(nativeHandle_); + } + private native int levelZeroFileNumCompactionTrigger(long handle); + + /** + * Number of files to trigger level-0 compaction. A value <0 means that + * level-0 compaction will not be triggered by number of files at all. + * Default: 4 + * + * @param numFiles the number of files in level-0 to trigger compaction. + * @return the reference to the current option. + */ + public Options setLevelZeroFileNumCompactionTrigger( + int numFiles) { + setLevelZeroFileNumCompactionTrigger( + nativeHandle_, numFiles); + return this; + } + private native void setLevelZeroFileNumCompactionTrigger( + long handle, int numFiles); + + /** + * Soft limit on the number of level-0 files. We start slowing down writes + * at this point. A value < 0 means that no writing slow down will be + * triggered by number of files in level-0. + * + * @return the soft limit on the number of level-0 files. + */ + public int levelZeroSlowdownWritesTrigger() { + return levelZeroSlowdownWritesTrigger(nativeHandle_); + } + private native int levelZeroSlowdownWritesTrigger(long handle); + + /** + * Soft limit on number of level-0 files. We start slowing down writes at this + * point. A value <0 means that no writing slow down will be triggered by + * number of files in level-0. + * + * @param numFiles soft limit on number of level-0 files. + * @return the reference to the current option. + */ + public Options setLevelZeroSlowdownWritesTrigger( + int numFiles) { + setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles); + return this; + } + private native void setLevelZeroSlowdownWritesTrigger( + long handle, int numFiles); + + /** + * Maximum number of level-0 files. We stop writes at this point. + * + * @return the hard limit of the number of level-0 file. 
+ */ + public int levelZeroStopWritesTrigger() { + return levelZeroStopWritesTrigger(nativeHandle_); + } + private native int levelZeroStopWritesTrigger(long handle); + + /** + * Maximum number of level-0 files. We stop writes at this point. + * + * @param numFiles the hard limit of the number of level-0 files. + * @return the reference to the current option. + */ + public Options setLevelZeroStopWritesTrigger(int numFiles) { + setLevelZeroStopWritesTrigger(nativeHandle_, numFiles); + return this; + } + private native void setLevelZeroStopWritesTrigger( + long handle, int numFiles); + + /** + * The highest level to which a new compacted memtable is pushed if it + * does not create overlap. We try to push to level 2 to avoid the + * relatively expensive level 0=>1 compactions and to avoid some + * expensive manifest file operations. We do not push all the way to + * the largest level since that can generate a lot of wasted disk + * space if the same key space is being repeatedly overwritten. + * + * @return the highest level where a new compacted memtable will be pushed. + */ + public int maxMemCompactionLevel() { + return maxMemCompactionLevel(nativeHandle_); + } + private native int maxMemCompactionLevel(long handle); + + /** + * The highest level to which a new compacted memtable is pushed if it + * does not create overlap. We try to push to level 2 to avoid the + * relatively expensive level 0=>1 compactions and to avoid some + * expensive manifest file operations. We do not push all the way to + * the largest level since that can generate a lot of wasted disk + * space if the same key space is being repeatedly overwritten. + * + * @param maxMemCompactionLevel the highest level to which a new compacted + * mem-table will be pushed. + * @return the reference to the current option. 
+ */ + public Options setMaxMemCompactionLevel(int maxMemCompactionLevel) { + setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel); + return this; + } + private native void setMaxMemCompactionLevel( + long handle, int maxMemCompactionLevel); + + /** + * The target file size for compaction. + * This targetFileSizeBase determines a level-1 file size. + * Target file size for level L can be calculated by + * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) + * For example, if targetFileSizeBase is 2MB and + * target_file_size_multiplier is 10, then each file on level-1 will + * be 2MB, and each file on level 2 will be 20MB, + * and each file on level-3 will be 200MB. + * by default targetFileSizeBase is 2MB. + * + * @return the target size of a level-0 file. + * + * @see targetFileSizeMultiplier() + */ + public int targetFileSizeBase() { + return targetFileSizeBase(nativeHandle_); + } + private native int targetFileSizeBase(long handle); + + /** + * The target file size for compaction. + * This targetFileSizeBase determines a level-1 file size. + * Target file size for level L can be calculated by + * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) + * For example, if targetFileSizeBase is 2MB and + * target_file_size_multiplier is 10, then each file on level-1 will + * be 2MB, and each file on level 2 will be 20MB, + * and each file on level-3 will be 200MB. + * by default targetFileSizeBase is 2MB. + * + * @param targetFileSizeBase the target size of a level-0 file. + * @return the reference to the current option. + * + * @see setTargetFileSizeMultiplier() + */ + public Options setTargetFileSizeBase(int targetFileSizeBase) { + setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); + return this; + } + private native void setTargetFileSizeBase( + long handle, int targetFileSizeBase); + + /** + * targetFileSizeMultiplier defines the size ratio between a + * level-(L+1) file and level-L file. 
+ * By default targetFileSizeMultiplier is 1, meaning + * files in different levels have the same target. + * + * @return the size ratio between a level-(L+1) file and level-L file. + */ + public int targetFileSizeMultiplier() { + return targetFileSizeMultiplier(nativeHandle_); + } + private native int targetFileSizeMultiplier(long handle); + + /** + * targetFileSizeMultiplier defines the size ratio between a + * level-L file and level-(L+1) file. + * By default target_file_size_multiplier is 1, meaning + * files in different levels have the same target. + * + * @param multiplier the size ratio between a level-(L+1) file + * and level-L file. + * @return the reference to the current option. + */ + public Options setTargetFileSizeMultiplier(int multiplier) { + setTargetFileSizeMultiplier(nativeHandle_, multiplier); + return this; + } + private native void setTargetFileSizeMultiplier( + long handle, int multiplier); + + /** + * The upper-bound of the total size of level-1 files in bytes. + * Maximum number of bytes for level L can be calculated as + * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) + * For example, if maxBytesForLevelBase is 20MB, and if + * max_bytes_for_level_multiplier is 10, total data size for level-1 + * will be 20MB, total file size for level-2 will be 200MB, + * and total file size for level-3 will be 2GB. + * by default 'maxBytesForLevelBase' is 10MB. + * + * @return the upper-bound of the total size of leve-1 files in bytes. + * @see maxBytesForLevelMultiplier() + */ + public long maxBytesForLevelBase() { + return maxBytesForLevelBase(nativeHandle_); + } + private native long maxBytesForLevelBase(long handle); + + /** + * The upper-bound of the total size of level-1 files in bytes. 
+ * Maximum number of bytes for level L can be calculated as + * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) + * For example, if maxBytesForLevelBase is 20MB, and if + * max_bytes_for_level_multiplier is 10, total data size for level-1 + * will be 20MB, total file size for level-2 will be 200MB, + * and total file size for level-3 will be 2GB. + * by default 'maxBytesForLevelBase' is 10MB. + * + * @param maxBytesForLevelBase the upper-bound of the total size of + * level-1 files in bytes. + * @return the reference to the current option. + * @see setMaxBytesForLevelMultiplier() + */ + public Options setMaxBytesForLevelBase(long maxBytesForLevelBase) { + setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase); + return this; + } + private native void setMaxBytesForLevelBase( + long handle, long maxBytesForLevelBase); + + /** + * The ratio between the total size of level-(L+1) files and the total + * size of level-L files for all L. + * DEFAULT: 10 + * + * @return the ratio between the total size of level-(L+1) files and + * the total size of level-L files for all L. + * @see maxBytesForLevelBase() + */ + public int maxBytesForLevelMultiplier() { + return maxBytesForLevelMultiplier(nativeHandle_); + } + private native int maxBytesForLevelMultiplier(long handle); + + /** + * The ratio between the total size of level-(L+1) files and the total + * size of level-L files for all L. + * DEFAULT: 10 + * + * @param multiplier the ratio between the total size of level-(L+1) + * files and the total size of level-L files for all L. + * @return the reference to the current option. + * @see setMaxBytesForLevelBase() + */ + public Options setMaxBytesForLevelMultiplier(int multiplier) { + setMaxBytesForLevelMultiplier(nativeHandle_, multiplier); + return this; + } + private native void setMaxBytesForLevelMultiplier( + long handle, int multiplier); + + /** + * Maximum number of bytes in all compacted files. 
We avoid expanding + * the lower level file set of a compaction if it would make the + * total compaction cover more than + * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + * + * @return the maximum number of bytes in all compacted files. + * @see sourceCompactionFactor() + */ + public int expandedCompactionFactor() { + return expandedCompactionFactor(nativeHandle_); + } + private native int expandedCompactionFactor(long handle); + + /** + * Maximum number of bytes in all compacted files. We avoid expanding + * the lower level file set of a compaction if it would make the + * total compaction cover more than + * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + * + * @param expandedCompactionFactor the maximum number of bytes in all + * compacted files. + * @return the reference to the current option. + * @see setSourceCompactionFactor() + */ + public Options setExpandedCompactionFactor(int expandedCompactionFactor) { + setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor); + return this; + } + private native void setExpandedCompactionFactor( + long handle, int expandedCompactionFactor); + + /** + * Maximum number of bytes in all source files to be compacted in a + * single compaction run. We avoid picking too many files in the + * source level so that we do not exceed the total source bytes + * for compaction to exceed + * (source_compaction_factor * targetFileSizeLevel()) many bytes. + * Default:1, i.e. pick maxfilesize amount of data as the source of + * a compaction. + * + * @return the maximum number of bytes in all source files to be compacted. + * @see expandedCompactionFactor() + */ + public int sourceCompactionFactor() { + return sourceCompactionFactor(nativeHandle_); + } + private native int sourceCompactionFactor(long handle); + + /** + * Maximum number of bytes in all source files to be compacted in a + * single compaction run. 
We avoid picking too many files in the + * source level so that we do not exceed the total source bytes + * for compaction to exceed + * (source_compaction_factor * targetFileSizeLevel()) many bytes. + * Default:1, i.e. pick maxfilesize amount of data as the source of + * a compaction. + * + * @param sourceCompactionFactor the maximum number of bytes in all + * source files to be compacted in a single compaction run. + * @return the reference to the current option. + * @see setExpandedCompactionFactor() + */ + public Options setSourceCompactionFactor(int sourceCompactionFactor) { + setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor); + return this; + } + private native void setSourceCompactionFactor( + long handle, int sourceCompactionFactor); + + /** + * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + * stop building a single file in a level->level+1 compaction. + * + * @return maximum bytes of overlaps in "grandparent" level. + */ + public int maxGrandparentOverlapFactor() { + return maxGrandparentOverlapFactor(nativeHandle_); + } + private native int maxGrandparentOverlapFactor(long handle); + + /** + * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + * stop building a single file in a level->level+1 compaction. + * + * @param maxGrandparentOverlapFactor maximum bytes of overlaps in + * "grandparent" level. + * @return the reference to the current option. + */ + public Options setMaxGrandparentOverlapFactor( + int maxGrandparentOverlapFactor) { + setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor); + return this; + } + private native void setMaxGrandparentOverlapFactor( + long handle, int maxGrandparentOverlapFactor); + + /** + * Puts are delayed 0-1 ms when any level has a compaction score that exceeds + * soft_rate_limit. This is ignored when == 0.0. + * CONSTRAINT: soft_rate_limit <= hard_rate_limit. 
If this constraint does not + * hold, RocksDB will set soft_rate_limit = hard_rate_limit + * Default: 0 (disabled) + * + * @return soft-rate-limit for put delay. + */ + public double softRateLimit() { + return softRateLimit(nativeHandle_); + } + private native double softRateLimit(long handle); + + /** + * Puts are delayed 0-1 ms when any level has a compaction score that exceeds + * soft_rate_limit. This is ignored when == 0.0. + * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not + * hold, RocksDB will set soft_rate_limit = hard_rate_limit + * Default: 0 (disabled) + * + * @param softRateLimit the soft-rate-limit of a compaction score + * for put delay. + * @return the reference to the current option. + */ + public Options setSoftRateLimit(double softRateLimit) { + setSoftRateLimit(nativeHandle_, softRateLimit); + return this; + } + private native void setSoftRateLimit( + long handle, double softRateLimit); + + /** + * Puts are delayed 1ms at a time when any level has a compaction score that + * exceeds hard_rate_limit. This is ignored when <= 1.0. + * Default: 0 (disabled) + * + * @return the hard-rate-limit of a compaction score for put delay. + */ + public double hardRateLimit() { + return hardRateLimit(nativeHandle_); + } + private native double hardRateLimit(long handle); + + /** + * Puts are delayed 1ms at a time when any level has a compaction score that + * exceeds hard_rate_limit. This is ignored when <= 1.0. + * Default: 0 (disabled) + * + * @param hardRateLimit the hard-rate-limit of a compaction score for put + * delay. + * @return the reference to the current option. + */ + public Options setHardRateLimit(double hardRateLimit) { + setHardRateLimit(nativeHandle_, hardRateLimit); + return this; + } + private native void setHardRateLimit( + long handle, double hardRateLimit); + + /** + * The maximum time interval a put will be stalled when hard_rate_limit + * is enforced. If 0, then there is no limit. 
+ * Default: 1000 + * + * @return the maximum time interval a put will be stalled when + * hard_rate_limit is enforced. + */ + public int rateLimitDelayMaxMilliseconds() { + return rateLimitDelayMaxMilliseconds(nativeHandle_); + } + private native int rateLimitDelayMaxMilliseconds(long handle); + + /** + * The maximum time interval a put will be stalled when hard_rate_limit + * is enforced. If 0, then there is no limit. + * Default: 1000 + * + * @param rateLimitDelayMaxMilliseconds the maximum time interval a put + * will be stalled. + * @return the reference to the current option. + */ + public Options setRateLimitDelayMaxMilliseconds( + int rateLimitDelayMaxMilliseconds) { + setRateLimitDelayMaxMilliseconds( + nativeHandle_, rateLimitDelayMaxMilliseconds); + return this; + } + private native void setRateLimitDelayMaxMilliseconds( + long handle, int rateLimitDelayMaxMilliseconds); + + /** + * Disable block cache. If this is set to true, + * then no block cache should be used, and the block_cache should + * point to a nullptr object. + * Default: false + * + * @return true if block cache is disabled. + */ + public boolean noBlockCache() { + return noBlockCache(nativeHandle_); + } + private native boolean noBlockCache(long handle); + + /** + * Disable block cache. If this is set to true, + * then no block cache should be used, and the block_cache should + * point to a nullptr object. + * Default: false + * + * @param noBlockCache true if block-cache is disabled. + * @return the reference to the current option. + */ + public Options setNoBlockCache(boolean noBlockCache) { + setNoBlockCache(nativeHandle_, noBlockCache); + return this; + } + private native void setNoBlockCache( + long handle, boolean noBlockCache); + + /** + * The size of one block in arena memory allocation. + * If <= 0, a proper value is automatically calculated (usually 1/10 of + * writer_buffer_size). 
+ * + * There are two additional restrictions on the specified size: + * (1) size should be in the range of [4096, 2 << 30] and + * (2) be the multiple of the CPU word (which helps with the memory + * alignment). + * + * We'll automatically check and adjust the size number to make sure it + * conforms to the restrictions. + * Default: 0 + * + * @return the size of an arena block + */ + public long arenaBlockSize() { + return arenaBlockSize(nativeHandle_); + } + private native long arenaBlockSize(long handle); + + /** + * The size of one block in arena memory allocation. + * If <= 0, a proper value is automatically calculated (usually 1/10 of + * writer_buffer_size). + * + * There are two additional restrictions on the specified size: + * (1) size should be in the range of [4096, 2 << 30] and + * (2) be the multiple of the CPU word (which helps with the memory + * alignment). + * + * We'll automatically check and adjust the size number to make sure it + * conforms to the restrictions. + * Default: 0 + * + * @param arenaBlockSize the size of an arena block + * @return the reference to the current option. + */ + public Options setArenaBlockSize(long arenaBlockSize) { + setArenaBlockSize(nativeHandle_, arenaBlockSize); + return this; + } + private native void setArenaBlockSize( + long handle, long arenaBlockSize); + + /** + * Disable automatic compactions. Manual compactions can still + * be issued on this column family + * + * @return true if auto-compactions are disabled. + */ + public boolean disableAutoCompactions() { + return disableAutoCompactions(nativeHandle_); + } + private native boolean disableAutoCompactions(long handle); + + /** + * Disable automatic compactions. Manual compactions can still + * be issued on this column family + * + * @param disableAutoCompactions true if auto-compactions are disabled. + * @return the reference to the current option. 
+ */ + public Options setDisableAutoCompactions(boolean disableAutoCompactions) { + setDisableAutoCompactions(nativeHandle_, disableAutoCompactions); + return this; + } + private native void setDisableAutoCompactions( + long handle, boolean disableAutoCompactions); + + /** + * Purge duplicate/deleted keys when a memtable is flushed to storage. + * Default: true + * + * @return true if purging keys is disabled. + */ + public boolean purgeRedundantKvsWhileFlush() { + return purgeRedundantKvsWhileFlush(nativeHandle_); + } + private native boolean purgeRedundantKvsWhileFlush(long handle); + + /** + * Purge duplicate/deleted keys when a memtable is flushed to storage. + * Default: true + * + * @param purgeRedundantKvsWhileFlush true if purging keys is disabled. + * @return the reference to the current option. + */ + public Options setPurgeRedundantKvsWhileFlush( + boolean purgeRedundantKvsWhileFlush) { + setPurgeRedundantKvsWhileFlush( + nativeHandle_, purgeRedundantKvsWhileFlush); + return this; + } + private native void setPurgeRedundantKvsWhileFlush( + long handle, boolean purgeRedundantKvsWhileFlush); + + /** + * This is used to close a block before it reaches the configured + * 'block_size'. If the percentage of free space in the current block is less + * than this specified number and adding a new record to the block will + * exceed the configured block size, then this block will be closed and the + * new record will be written to the next block. + * Default is 10. + * + * @return the target block size + */ + public int blockSizeDeviation() { + return blockSizeDeviation(nativeHandle_); + } + private native int blockSizeDeviation(long handle); + + /** + * This is used to close a block before it reaches the configured + * 'block_size'. 
If the percentage of free space in the current block is less + * than this specified number and adding a new record to the block will + * exceed the configured block size, then this block will be closed and the + * new record will be written to the next block. + * Default is 10. + * + * @param blockSizeDeviation the target block size + * @return the reference to the current option. + */ + public Options setBlockSizeDeviation(int blockSizeDeviation) { + setBlockSizeDeviation(nativeHandle_, blockSizeDeviation); + return this; + } + private native void setBlockSizeDeviation( + long handle, int blockSizeDeviation); + + /** + * If true, compaction will verify checksum on every read that happens + * as part of compaction + * Default: true + * + * @return true if compaction verifies checksum on every read. + */ + public boolean verifyChecksumsInCompaction() { + return verifyChecksumsInCompaction(nativeHandle_); + } + private native boolean verifyChecksumsInCompaction(long handle); + + /** + * If true, compaction will verify checksum on every read that happens + * as part of compaction + * Default: true + * + * @param verifyChecksumsInCompaction true if compaction verifies + * checksum on every read. + * @return the reference to the current option. + */ + public Options setVerifyChecksumsInCompaction( + boolean verifyChecksumsInCompaction) { + setVerifyChecksumsInCompaction( + nativeHandle_, verifyChecksumsInCompaction); + return this; + } + private native void setVerifyChecksumsInCompaction( + long handle, boolean verifyChecksumsInCompaction); + + /** + * Use KeyMayExist API to filter deletes when this is true. + * If KeyMayExist returns false, i.e. the key definitely does not exist, then + * the delete is a noop. KeyMayExist only incurs in-memory look up. + * This optimization avoids writing the delete to storage when appropriate. + * Default: false + * + * @return true if filter-deletes behavior is on. 
+ */ + public boolean filterDeletes() { + return filterDeletes(nativeHandle_); + } + private native boolean filterDeletes(long handle); + + /** + * Use KeyMayExist API to filter deletes when this is true. + * If KeyMayExist returns false, i.e. the key definitely does not exist, then + * the delete is a noop. KeyMayExist only incurs in-memory look up. + * This optimization avoids writing the delete to storage when appropriate. + * Default: false + * + * @param filterDeletes true if filter-deletes behavior is on. + * @return the reference to the current option. + */ + public Options setFilterDeletes(boolean filterDeletes) { + setFilterDeletes(nativeHandle_, filterDeletes); + return this; + } + private native void setFilterDeletes( + long handle, boolean filterDeletes); + + /** + * An iteration->Next() sequentially skips over keys with the same + * user-key unless this option is set. This number specifies the number + * of keys (with the same userkey) that will be sequentially + * skipped before a reseek is issued. + * Default: 8 + * + * @return the number of keys could be skipped in a iteration. + */ + public long maxSequentialSkipInIterations() { + return maxSequentialSkipInIterations(nativeHandle_); + } + private native long maxSequentialSkipInIterations(long handle); + + /** + * An iteration->Next() sequentially skips over keys with the same + * user-key unless this option is set. This number specifies the number + * of keys (with the same userkey) that will be sequentially + * skipped before a reseek is issued. + * Default: 8 + * + * @param maxSequentialSkipInIterations the number of keys could + * be skipped in a iteration. + * @return the reference to the current option. 
+ */ + public Options setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) { + setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); + return this; + } + private native void setMaxSequentialSkipInIterations( + long handle, long maxSequentialSkipInIterations); + + /** + * Allows thread-safe inplace updates. + * If inplace_callback function is not set, + * Put(key, new_value) will update inplace the existing_value iff + * * key exists in current memtable + * * new sizeof(new_value) <= sizeof(existing_value) + * * existing_value for that key is a put i.e. kTypeValue + * If inplace_callback function is set, check doc for inplace_callback. + * Default: false. + * + * @return true if thread-safe inplace updates are allowed. + */ + public boolean inplaceUpdateSupport() { + return inplaceUpdateSupport(nativeHandle_); + } + private native boolean inplaceUpdateSupport(long handle); + + /** + * Allows thread-safe inplace updates. + * If inplace_callback function is not set, + * Put(key, new_value) will update inplace the existing_value iff + * * key exists in current memtable + * * new sizeof(new_value) <= sizeof(existing_value) + * * existing_value for that key is a put i.e. kTypeValue + * If inplace_callback function is set, check doc for inplace_callback. + * Default: false. + * + * @param inplaceUpdateSupport true if thread-safe inplace updates + * are allowed. + * @return the reference to the current option. + */ + public Options setInplaceUpdateSupport(boolean inplaceUpdateSupport) { + setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); + return this; + } + private native void setInplaceUpdateSupport( + long handle, boolean inplaceUpdateSupport); + + /** + * Number of locks used for inplace update + * Default: 10000, if inplace_update_support = true, else 0. + * + * @return the number of locks used for inplace update. 
+ */ + public long inplaceUpdateNumLocks() { + return inplaceUpdateNumLocks(nativeHandle_); + } + private native long inplaceUpdateNumLocks(long handle); + + /** + * Number of locks used for inplace update + * Default: 10000, if inplace_update_support = true, else 0. + * + * @param inplaceUpdateNumLocks the number of locks used for + * inplace updates. + * @return the reference to the current option. + */ + public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) { + setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks); + return this; + } + private native void setInplaceUpdateNumLocks( + long handle, long inplaceUpdateNumLocks); + + /** + * Returns the number of bits used in the prefix bloom filter. + * + * This value will be used only when a prefix-extractor is specified. + * + * @return the number of bloom-bits. + * @see useFixedLengthPrefixExtractor() + */ + public int memtablePrefixBloomBits() { + return memtablePrefixBloomBits(nativeHandle_); + } + private native int memtablePrefixBloomBits(long handle); + + /** + * Sets the number of bits used in the prefix bloom filter. + * + * This value will be used only when a prefix-extractor is specified. + * + * @param memtablePrefixBloomBits the number of bits used in the + * prefix bloom filter. + * @return the reference to the current option. + */ + public Options setMemtablePrefixBloomBits(int memtablePrefixBloomBits) { + setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits); + return this; + } + private native void setMemtablePrefixBloomBits( + long handle, int memtablePrefixBloomBits); + + /** + * The number of hash probes per key used in the mem-table. + * + * @return the number of hash probes per key. + */ + public int memtablePrefixBloomProbes() { + return memtablePrefixBloomProbes(nativeHandle_); + } + private native int memtablePrefixBloomProbes(long handle); + + /** + * The number of hash probes per key used in the mem-table. 
+ * + * @param memtablePrefixBloomProbes the number of hash probes per key. + * @return the reference to the current option. + */ + public Options setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) { + setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); + return this; + } + private native void setMemtablePrefixBloomProbes( + long handle, int memtablePrefixBloomProbes); + + /** + * Control locality of bloom filter probes to improve cache miss rate. + * This option only applies to memtable prefix bloom and plaintable + * prefix bloom. It essentially limits the max number of cache lines each + * bloom filter check can touch. + * This optimization is turned off when set to 0. The number should never + * be greater than number of probes. This option can boost performance + * for in-memory workload but should use with care since it can cause + * higher false positive rate. + * Default: 0 + * + * @return the level of locality of bloom-filter probes. + * @see setMemTablePrefixBloomProbes + */ + public int bloomLocality() { + return bloomLocality(nativeHandle_); + } + private native int bloomLocality(long handle); + + /** + * Control locality of bloom filter probes to improve cache miss rate. + * This option only applies to memtable prefix bloom and plaintable + * prefix bloom. It essentially limits the max number of cache lines each + * bloom filter check can touch. + * This optimization is turned off when set to 0. The number should never + * be greater than number of probes. This option can boost performance + * for in-memory workload but should use with care since it can cause + * higher false positive rate. + * Default: 0 + * + * @param bloomLocality the level of locality of bloom-filter probes. + * @return the reference to the current option. 
+ */ + public Options setBloomLocality(int bloomLocality) { + setBloomLocality(nativeHandle_, bloomLocality); + return this; + } + private native void setBloomLocality( + long handle, int bloomLocality); + + /** + * Maximum number of successive merge operations on a key in the memtable. + * + * When a merge operation is added to the memtable and the maximum number of + * successive merges is reached, the value of the key will be calculated and + * inserted into the memtable instead of the merge operation. This will + * ensure that there are never more than max_successive_merges merge + * operations in the memtable. + * + * Default: 0 (disabled) + * + * @return the maximum number of successive merges. + */ + public long maxSuccessiveMerges() { + return maxSuccessiveMerges(nativeHandle_); + } + private native long maxSuccessiveMerges(long handle); + + /** + * Maximum number of successive merge operations on a key in the memtable. + * + * When a merge operation is added to the memtable and the maximum number of + * successive merges is reached, the value of the key will be calculated and + * inserted into the memtable instead of the merge operation. This will + * ensure that there are never more than max_successive_merges merge + * operations in the memtable. + * + * Default: 0 (disabled) + * + * @param maxSuccessiveMerges the maximum number of successive merges. + * @return the reference to the current option. + */ + public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) { + setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); + return this; + } + private native void setMaxSuccessiveMerges( + long handle, long maxSuccessiveMerges); + + /** + * The minimum number of write buffers that will be merged together + * before writing to storage. If set to 1, then + * all write buffers are fushed to L0 as individual files and this increases + * read amplification because a get request has to check in all of these + * files. 
Also, an in-memory merge may result in writing lesser + * data to storage if there are duplicate records in each of these + * individual write buffers. Default: 1 + * + * @return the minimum number of write buffers that will be merged together. + */ + public int minWriteBufferNumberToMerge() { + return minWriteBufferNumberToMerge(nativeHandle_); + } + private native int minWriteBufferNumberToMerge(long handle); + + /** + * The minimum number of write buffers that will be merged together + * before writing to storage. If set to 1, then + * all write buffers are flushed to L0 as individual files and this increases + * read amplification because a get request has to check in all of these + * files. Also, an in-memory merge may result in writing lesser + * data to storage if there are duplicate records in each of these + * individual write buffers. Default: 1 + * + * @param minWriteBufferNumberToMerge the minimum number of write buffers + * that will be merged together. + * @return the reference to the current option. + */ + public Options setMinWriteBufferNumberToMerge(int minWriteBufferNumberToMerge) { + setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge); + return this; + } + private native void setMinWriteBufferNumberToMerge( + long handle, int minWriteBufferNumberToMerge); + + /** + * The number of partial merge operands to accumulate before partial + * merge will be performed. Partial merge will not be called + * if the list of values to merge is less than min_partial_merge_operands. + * + * If min_partial_merge_operands < 2, then it will be treated as 2. + * + * Default: 2 + * + * @return the number of partial merge operands to accumulate before + * partial merge is performed. + */ + public int minPartialMergeOperands() { + return minPartialMergeOperands(nativeHandle_); + } + private native int minPartialMergeOperands(long handle); + + /** + * The number of partial merge operands to accumulate before partial + * merge will be performed. 
Partial merge will not be called + * if the list of values to merge is less than min_partial_merge_operands. + * + * If min_partial_merge_operands < 2, then it will be treated as 2. + * + * Default: 2 + * + * @param minPartialMergeOperands + * @return the reference to the current option. + */ + public Options setMinPartialMergeOperands(int minPartialMergeOperands) { + setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands); + return this; + } + private native void setMinPartialMergeOperands( + long handle, int minPartialMergeOperands); + /** * Release the memory allocated for the current instance * in the c++ side. @@ -1250,8 +2358,7 @@ public class Options { private native void useFixedLengthPrefixExtractor( long handle, int prefixLength); - private native void setFilter0(long optHandle, Filter fp); - long nativeHandle_; long cacheSize_; + Filter filter_; } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 649433b8c..728798ade 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -39,6 +39,7 @@ public class RocksDB { // the c++ one. Options options = new Options(); db.open(options.nativeHandle_, options.cacheSize_, path); + db.transferCppRawPointersOwnershipFrom(options); options.dispose(); return db; } @@ -49,8 +50,12 @@ public class RocksDB { */ public static RocksDB open(Options options, String path) throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. 
RocksDB db = new RocksDB(); db.open(options.nativeHandle_, options.cacheSize_, path); + db.transferCppRawPointersOwnershipFrom(options); return db; } @@ -253,6 +258,17 @@ public class RocksDB { nativeHandle_ = 0; } + /** + * Transfer the ownership of all c++ raw-pointers from Options + * to RocksDB to ensure the life-time of those raw-pointers + * will be at least as long as the life-time of any RocksDB + * that uses these raw-pointers. + */ + protected void transferCppRawPointersOwnershipFrom(Options opt) { + filter_ = opt.filter_; + opt.filter_ = null; + } + // native methods protected native void open( long optionsHandle, long cacheSize, String path) throws RocksDBException; @@ -289,4 +305,5 @@ public class RocksDB { protected native void close0(); protected long nativeHandle_; + protected Filter filter_; } diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index 0106413cf..37b08bc15 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -54,6 +54,10 @@ class Stats { StringBuilder message_; boolean excludeFromMerge_; + // TODO(yhchiang): use the following arguments: + // (Long)Flag.stats_interval + // (Integer)Flag.stats_per_interval + Stats(int id) { id_ = id; nextReport_ = 100; @@ -163,6 +167,7 @@ public class DbBenchmark { } abstract class BenchmarkTask implements Callable { + // TODO(yhchiang): use (Integer)Flag.perf_level. 
public BenchmarkTask( int tid, long randSeed, long numEntries, long keyRange) { tid_ = tid; @@ -311,13 +316,73 @@ public class DbBenchmark { } } + class WriteUniqueRandomTask extends WriteTask { + static final int MAX_BUFFER_SIZE = 10000000; + public WriteUniqueRandomTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch); + initRandomKeySequence(); + } + public WriteUniqueRandomTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch, + long maxWritesPerSecond) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch, + maxWritesPerSecond); + initRandomKeySequence(); + } + @Override protected void getKey(byte[] key, long id, long range) { + generateKeyFromLong(key, nextUniqueRandom()); + } + + protected void initRandomKeySequence() { + bufferSize_ = MAX_BUFFER_SIZE; + if (bufferSize_ > keyRange_) { + bufferSize_ = (int) keyRange_; + } + currentKeyCount_ = bufferSize_; + keyBuffer_ = new long[MAX_BUFFER_SIZE]; + for (int k = 0; k < bufferSize_; ++k) { + keyBuffer_[k] = k; + } + } + + /** + * Semi-randomly return the next unique key. It is guaranteed to be + * fully random if keyRange_ <= MAX_BUFFER_SIZE. + */ + long nextUniqueRandom() { + if (bufferSize_ == 0) { + System.err.println("bufferSize_ == 0."); + return 0; + } + int r = rand_.nextInt(bufferSize_); + // randomly pick one from the keyBuffer + long randKey = keyBuffer_[r]; + if (currentKeyCount_ < keyRange_) { + // if we have not yet inserted all keys, insert next new key to [r]. + keyBuffer_[r] = currentKeyCount_++; + } else { + // move the last element to [r] and decrease the size by 1. 
+ keyBuffer_[r] = keyBuffer_[--bufferSize_]; + } + return randKey; + } + + int bufferSize_; + long currentKeyCount_; + long[] keyBuffer_; + } + class ReadRandomTask extends BenchmarkTask { public ReadRandomTask( int tid, long randSeed, long numEntries, long keyRange) { super(tid, randSeed, numEntries, keyRange); } @Override public void runTask() throws RocksDBException { - stats_.found_ = 0; byte[] key = new byte[keySize_]; byte[] value = new byte[valueSize_]; for (long i = 0; i < numEntries_; i++) { @@ -338,18 +403,22 @@ public class DbBenchmark { class ReadSequentialTask extends BenchmarkTask { public ReadSequentialTask( - int tid, long randSeed, long numEntries, long keyRange, long initId) { + int tid, long randSeed, long numEntries, long keyRange) { super(tid, randSeed, numEntries, keyRange); - initId_ = initId; } @Override public void runTask() throws RocksDBException { - // make sure we have enough things to read in sequential - if (numEntries_ > keyRange_ - initId_) { - numEntries_ = keyRange_ - initId_; + org.rocksdb.Iterator iter = db_.newIterator(); + long i; + for (iter.seekToFirst(), i = 0; + iter.isValid() && i < numEntries_; + iter.next(), ++i) { + stats_.found_++; + stats_.finishedSingleOp(iter.key().length + iter.value().length); + if (isFinished()) { + return; + } } - throw new UnsupportedOperationException(); } - private long initId_; } public DbBenchmark(Map flags) throws Exception { @@ -360,22 +429,33 @@ public class DbBenchmark { flags.get(Flag.num) : flags.get(Flag.reads)); keySize_ = (Integer) flags.get(Flag.key_size); valueSize_ = (Integer) flags.get(Flag.value_size); - writeBufferSize_ = (Integer) flags.get(Flag.write_buffer_size) > 0 ? 
- (Integer) flags.get(Flag.write_buffer_size) : 0; compressionRatio_ = (Double) flags.get(Flag.compression_ratio); useExisting_ = (Boolean) flags.get(Flag.use_existing_db); randSeed_ = (Long) flags.get(Flag.seed); databaseDir_ = (String) flags.get(Flag.db); writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second); cacheSize_ = (Long) flags.get(Flag.cache_size); - gen_ = new RandomGenerator(compressionRatio_); + gen_ = new RandomGenerator(randSeed_, compressionRatio_); memtable_ = (String) flags.get(Flag.memtablerep); maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number); prefixSize_ = (Integer) flags.get(Flag.prefix_size); keysPerPrefix_ = (Integer) flags.get(Flag.keys_per_prefix); hashBucketCount_ = (Long) flags.get(Flag.hash_bucket_count); usePlainTable_ = (Boolean) flags.get(Flag.use_plain_table); + flags_ = flags; finishLock_ = new Object(); + // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size)); + // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix)); + } + + private void prepareReadOptions(ReadOptions options) { + options.setVerifyChecksums((Boolean)flags_.get(Flag.verify_checksum)); + options.setTailing((Boolean)flags_.get(Flag.use_tailing_iterator)); + } + + private void prepareWriteOptions(WriteOptions options) { + options.setSync((Boolean)flags_.get(Flag.sync)); + options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal)); } private void prepareOptions(Options options) { @@ -405,9 +485,119 @@ public class DbBenchmark { options.memTableFactoryName()); } if (usePlainTable_) { - options.setSstFormatConfig( + options.setTableFormatConfig( new PlainTableConfig().setKeySize(keySize_)); } + options.setMaxWriteBufferNumber( + (Integer)flags_.get(Flag.max_write_buffer_number)); + options.setMaxBackgroundCompactions( + (Integer)flags_.get(Flag.max_background_compactions)); + options.setMaxBackgroundFlushes( + (Integer)flags_.get(Flag.max_background_flushes)); + options.setCacheSize( + 
(Long)flags_.get(Flag.cache_size)); + options.setBlockSize( + (Long)flags_.get(Flag.block_size)); + options.setMaxOpenFiles( + (Integer)flags_.get(Flag.open_files)); + options.setCreateIfMissing( + !(Boolean)flags_.get(Flag.use_existing_db)); + options.setTableCacheRemoveScanCountLimit( + (Integer)flags_.get(Flag.cache_remove_scan_count_limit)); + options.setDisableDataSync( + (Boolean)flags_.get(Flag.disable_data_sync)); + options.setUseFsync( + (Boolean)flags_.get(Flag.use_fsync)); + options.setWalDir( + (String)flags_.get(Flag.wal_dir)); + options.setDisableSeekCompaction( + (Boolean)flags_.get(Flag.disable_seek_compaction)); + options.setDeleteObsoleteFilesPeriodMicros( + (Long)flags_.get(Flag.delete_obsolete_files_period_micros)); + options.setTableCacheNumshardbits( + (Integer)flags_.get(Flag.table_cache_numshardbits)); + options.setAllowMmapReads( + (Boolean)flags_.get(Flag.mmap_read)); + options.setAllowMmapWrites( + (Boolean)flags_.get(Flag.mmap_write)); + options.setAdviseRandomOnOpen( + (Boolean)flags_.get(Flag.advise_random_on_open)); + options.setUseAdaptiveMutex( + (Boolean)flags_.get(Flag.use_adaptive_mutex)); + options.setBytesPerSync( + (Long)flags_.get(Flag.bytes_per_sync)); + options.setBloomLocality( + (Integer)flags_.get(Flag.bloom_locality)); + options.setMinWriteBufferNumberToMerge( + (Integer)flags_.get(Flag.min_write_buffer_number_to_merge)); + options.setMemtablePrefixBloomBits( + (Integer)flags_.get(Flag.memtable_bloom_bits)); + options.setNumLevels( + (Integer)flags_.get(Flag.num_levels)); + options.setTargetFileSizeBase( + (Integer)flags_.get(Flag.target_file_size_base)); + options.setTargetFileSizeMultiplier( + (Integer)flags_.get(Flag.target_file_size_multiplier)); + options.setMaxBytesForLevelBase( + (Integer)flags_.get(Flag.max_bytes_for_level_base)); + options.setMaxBytesForLevelMultiplier( + (Integer)flags_.get(Flag.max_bytes_for_level_multiplier)); + options.setLevelZeroStopWritesTrigger( + 
(Integer)flags_.get(Flag.level0_stop_writes_trigger)); + options.setLevelZeroSlowdownWritesTrigger( + (Integer)flags_.get(Flag.level0_slowdown_writes_trigger)); + options.setLevelZeroFileNumCompactionTrigger( + (Integer)flags_.get(Flag.level0_file_num_compaction_trigger)); + options.setSoftRateLimit( + (Double)flags_.get(Flag.soft_rate_limit)); + options.setHardRateLimit( + (Double)flags_.get(Flag.hard_rate_limit)); + options.setRateLimitDelayMaxMilliseconds( + (Integer)flags_.get(Flag.rate_limit_delay_max_milliseconds)); + options.setMaxGrandparentOverlapFactor( + (Integer)flags_.get(Flag.max_grandparent_overlap_factor)); + options.setDisableAutoCompactions( + (Boolean)flags_.get(Flag.disable_auto_compactions)); + options.setSourceCompactionFactor( + (Integer)flags_.get(Flag.source_compaction_factor)); + options.setFilterDeletes( + (Boolean)flags_.get(Flag.filter_deletes)); + options.setMaxSuccessiveMerges( + (Integer)flags_.get(Flag.max_successive_merges)); + options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds)); + options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB)); + int bloomBits = (Integer)flags_.get(Flag.bloom_bits); + if (bloomBits > 0) { + // Internally, options will keep a reference to this BloomFilter. + // This will disallow Java to GC this BloomFilter. In addition, + // options.dispose() will release the c++ object of this BloomFilter. + // As a result, the caller should not directly call + // BloomFilter.dispose(). 
+ options.setFilter(new BloomFilter(bloomBits)); + } + /* TODO(yhchiang): enable the following parameters + options.setCompressionType((String)flags_.get(Flag.compression_type)); + options.setCompressionLevel((Integer)flags_.get(Flag.compression_level)); + options.setMinLevelToCompress((Integer)flags_.get(Flag.min_level_to_compress)); + options.setHdfs((String)flags_.get(Flag.hdfs)); // env + options.setCacheNumshardbits((Integer)flags_.get(Flag.cache_numshardbits)); + options.setStatistics((Boolean)flags_.get(Flag.statistics)); + options.setUniversalSizeRatio( + (Integer)flags_.get(Flag.universal_size_ratio)); + options.setUniversalMinMergeWidth( + (Integer)flags_.get(Flag.universal_min_merge_width)); + options.setUniversalMaxMergeWidth( + (Integer)flags_.get(Flag.universal_max_merge_width)); + options.setUniversalMaxSizeAmplificationPercent( + (Integer)flags_.get(Flag.universal_max_size_amplification_percent)); + options.setUniversalCompressionSizePercent( + (Integer)flags_.get(Flag.universal_compression_size_percent)); + // TODO(yhchiang): add RocksDB.openForReadOnly() to enable Flag.readonly + // TODO(yhchiang): enable Flag.merge_operator by switch + options.setAccessHintOnCompactionStart( + (String)flags_.get(Flag.compaction_fadvice)); + // available values of fadvice are "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" for fadvice + */ } private void run() throws RocksDBException { @@ -424,6 +614,9 @@ public class DbBenchmark { List> tasks = new ArrayList>(); List> bgTasks = new ArrayList>(); WriteOptions writeOpt = new WriteOptions(); + prepareWriteOptions(writeOpt); + ReadOptions readOpt = new ReadOptions(); + prepareReadOptions(readOpt); int currentTaskId = 0; boolean known = true; @@ -436,6 +629,9 @@ public class DbBenchmark { } else if (benchmark.equals("fillrandom")) { tasks.add(new WriteRandomTask( currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); + } else if (benchmark.equals("filluniquerandom")) { + tasks.add(new WriteUniqueRandomTask( + 
currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); } else if (benchmark.equals("fillsync")) { writeOpt.setSync(true); tasks.add(new WriteRandomTask( @@ -444,13 +640,12 @@ public class DbBenchmark { } else if (benchmark.equals("readseq")) { for (int t = 0; t < threadNum_; ++t) { tasks.add(new ReadSequentialTask( - currentTaskId++, randSeed_, reads_ / threadNum_, - num_, (num_ / threadNum_) * t)); + currentTaskId++, randSeed_, reads_, num_)); } } else if (benchmark.equals("readrandom")) { for (int t = 0; t < threadNum_; ++t) { tasks.add(new ReadRandomTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_)); + currentTaskId++, randSeed_, reads_, num_)); } } else if (benchmark.equals("readwhilewriting")) { WriteTask writeTask = new WriteRandomTask( @@ -508,6 +703,7 @@ public class DbBenchmark { } } writeOpt.dispose(); + readOpt.dispose(); } options.dispose(); db_.close(); @@ -573,7 +769,7 @@ public class DbBenchmark { System.out.printf( "%-16s : %11.5f micros/op; %6.1f MB/s; %d / %d task(s) finished.\n", - benchmark, elapsedSeconds * 1e6 / num_, + benchmark, elapsedSeconds * 1e6 / stats.done_, (stats.bytes_ / 1048576.0) / elapsedSeconds, taskFinishedCount, concurrentThreads); } @@ -616,14 +812,13 @@ public class DbBenchmark { static void printHelp() { System.out.println("usage:"); for (Flag flag : Flag.values()) { - System.out.format(" --%s%n %s%n", + System.out.format(" --%s%n\t%s%n", flag.name(), flag.desc()); if (flag.getDefaultValue() != null) { - System.out.format(" DEFAULT: %s%n", + System.out.format("\tDEFAULT: %s%n", flag.getDefaultValue().toString()); } - System.out.println(""); } } @@ -677,30 +872,28 @@ public class DbBenchmark { "\t\tfillseq -- write N values in sequential key order in async mode.\n" + "\t\tfillrandom -- write N values in random key order in async mode.\n" + "\t\tfillbatch -- write N/1000 batch where each batch has 1000 values\n" + - "\t\t in random key order in sync mode.\n" + + "\t\t in random key order in sync mode.\n" + 
"\t\tfillsync -- write N/100 values in random key order in sync mode.\n" + "\t\tfill100K -- write N/1000 100K values in random order in async mode.\n" + "\t\treadseq -- read N times sequentially.\n" + "\t\treadrandom -- read N times in random order.\n" + "\t\treadhot -- read N times in random order from 1% section of DB.\n" + "\t\treadwhilewriting -- measure the read performance of multiple readers\n" + - "\t\t with a bg single writer. The write rate of the bg\n" + - "\t\t is capped by --writes_per_second.\n" + + "\t\t with a bg single writer. The write rate of the bg\n" + + "\t\t is capped by --writes_per_second.\n" + "\tMeta Operations:\n" + "\t\tdelete -- delete DB") { @Override public Object parseValue(String value) { return new ArrayList(Arrays.asList(value.split(","))); } }, - compression_ratio(0.5d, "Arrange to generate values that shrink to this fraction of\n" + - "\ttheir original size after compression") { + "\ttheir original size after compression.") { @Override public Object parseValue(String value) { return Double.parseDouble(value); } }, - use_existing_db(false, "If true, do not destroy the existing database. If you set this\n" + "\tflag and also specify a benchmark that wants a fresh database,\n" + @@ -709,51 +902,43 @@ public class DbBenchmark { return Boolean.parseBoolean(value); } }, - num(1000000, "Number of key/values to place in database.") { @Override public Object parseValue(String value) { return Integer.parseInt(value); } }, - threads(1, "Number of concurrent threads to run.") { @Override public Object parseValue(String value) { return Integer.parseInt(value); } }, - reads(null, "Number of read operations to do. 
If negative, do --nums reads.") { - @Override - public Object parseValue(String value) { + @Override public Object parseValue(String value) { return Integer.parseInt(value); } }, - key_size(16, "The size of each key in bytes.") { @Override public Object parseValue(String value) { return Integer.parseInt(value); } }, - value_size(100, "The size of each value in bytes.") { @Override public Object parseValue(String value) { return Integer.parseInt(value); } }, - write_buffer_size(4 << 20, "Number of bytes to buffer in memtable before compacting\n" + "\t(initialized to default value by 'main'.)") { @Override public Object parseValue(String value) { - return Integer.parseInt(value); + return Long.parseLong(value); } }, - max_write_buffer_number(2, "The number of in-memory memtables. Each memtable is of size\n" + "\twrite_buffer_size.") { @@ -761,14 +946,12 @@ public class DbBenchmark { return Integer.parseInt(value); } }, - prefix_size(0, "Controls the prefix size for HashSkipList, HashLinkedList,\n" + "\tand plain table.") { @Override public Object parseValue(String value) { return Integer.parseInt(value); } }, - keys_per_prefix(0, "Controls the average number of keys generated\n" + "\tper prefix, 0 means no special handling of the prefix,\n" + "\ti.e. use the prefix comes with the generated random number.") { @@ -776,7 +959,6 @@ public class DbBenchmark { return Integer.parseInt(value); } }, - memtablerep("skip_list", "The memtable format. Available options are\n" + "\tskip_list,\n" + @@ -787,7 +969,6 @@ public class DbBenchmark { return value; } }, - hash_bucket_count(SizeUnit.MB, "The number of hash buckets used in the hash-bucket-based\n" + "\tmemtables. Memtables that currently support this argument are\n" + @@ -796,7 +977,6 @@ public class DbBenchmark { return Long.parseLong(value); } }, - writes_per_second(10000, "The write-rate of the background writer used in the\n" + "\t`readwhilewriting` benchmark. 
Non-positive number indicates\n" + @@ -805,14 +985,12 @@ public class DbBenchmark { return Integer.parseInt(value); } }, - use_plain_table(false, "Use plain-table sst format.") { @Override public Object parseValue(String value) { return Boolean.parseBoolean(value); } }, - cache_size(-1L, "Number of bytes to use as a cache of uncompressed data.\n" + "\tNegative means use default settings.") { @@ -820,15 +998,445 @@ public class DbBenchmark { return Long.parseLong(value); } }, - seed(0L, "Seed base for random number generators.") { @Override public Object parseValue(String value) { return Long.parseLong(value); } }, - - + num_levels(7, + "The total number of levels.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + numdistinct(1000, + "Number of distinct keys to use. Used in RandomWithVerify to\n" + + "\tread/write on fewer keys so that gets are more likely to find the\n" + + "\tkey and puts are more likely to update the same key.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + merge_keys(-1, + "Number of distinct keys to use for MergeRandom and\n" + + "\tReadRandomMergeRandom.\n" + + "\tIf negative, there will be FLAGS_num keys.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + bloom_locality(0,"Control bloom filter probes locality.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + duration(0,"Time in seconds for the random-ops tests to run.\n" + + "\tWhen 0 then num & reads determine the test duration.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + num_multi_db(0, + "Number of DBs used in the benchmark. 
0 means single DB.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + histogram(false,"Print histogram of operation timings.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + min_write_buffer_number_to_merge( + defaultOptions_.minWriteBufferNumberToMerge(), + "The minimum number of write buffers that will be merged together\n" + + "\tbefore writing to storage. This is cheap because it is an\n" + + "\tin-memory merge. If this feature is not enabled, then all these\n" + + "\twrite buffers are flushed to L0 as separate files and this\n" + + "\tincreases read amplification because a get request has to check\n" + + "\tin all of these files. Also, an in-memory merge may result in\n" + + "\twriting less data to storage if there are duplicate records\n" + + "\tin each of these individual write buffers.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_background_compactions( + defaultOptions_.maxBackgroundCompactions(), + "The maximum number of concurrent background compactions\n" + + "\tthat can occur in parallel.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_background_flushes( + defaultOptions_.maxBackgroundFlushes(), + "The maximum number of concurrent background flushes\n" + + "\tthat can occur in parallel.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + /* TODO(yhchiang): enable the following + compaction_style((int32_t) defaultOptions_.compactionStyle(), + "style of compaction: level-based vs universal.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + },*/ + universal_size_ratio(0, + "Percentage flexibility while comparing file size\n" + + "\t(for universal compaction only).") { + @Override public Object parseValue(String value) { + return 
Integer.parseInt(value); + } + }, + universal_min_merge_width(0,"The minimum number of files in a\n" + + "\tsingle compaction run (for universal compaction only).") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + universal_max_merge_width(0,"The max number of files to compact\n" + + "\tin universal style compaction.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + universal_max_size_amplification_percent(0, + "The max size amplification for universal style compaction.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + universal_compression_size_percent(-1, + "The percentage of the database to compress for universal\n" + + "\tcompaction. -1 means compress everything.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + block_size(defaultOptions_.blockSize(), + "Number of bytes in a block.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + compressed_cache_size(-1, + "Number of bytes to use as a cache of compressed data.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + open_files(defaultOptions_.maxOpenFiles(), + "Maximum number of files to keep open at the same time\n" + + "\t(use default if == 0)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + bloom_bits(-1,"Bloom filter bits per key. 
Negative means\n" + + "\tuse default settings.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + memtable_bloom_bits(0,"Bloom filter bits per key for memtable.\n" + + "\tNegative means no bloom filter.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + cache_numshardbits(-1,"Number of shards for the block cache\n" + + "\tis 2 ** cache_numshardbits. Negative means use default settings.\n" + + "\tThis is applied only if FLAGS_cache_size is non-negative.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + cache_remove_scan_count_limit(32,"") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + verify_checksum(false,"Verify checksum for every block read\n" + + "\tfrom storage.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + statistics(false,"Database statistics.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + writes(-1,"Number of write operations to do. 
If negative, do\n" + + "\t--num reads.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + sync(false,"Sync all writes to disk.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + disable_data_sync(false,"If true, do not wait until data is\n" + + "\tsynced to disk.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + use_fsync(false,"If true, issue fsync instead of fdatasync.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + disable_wal(false,"If true, do not write WAL for write.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + wal_dir("", "If not empty, use the given dir for WAL.") { + @Override public Object parseValue(String value) { + return value; + } + }, + target_file_size_base(2 * 1048576,"Target file size at level-1") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + target_file_size_multiplier(1, + "A multiplier to compute target level-N file size (N >= 2)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_bytes_for_level_base(10 * 1048576, + "Max bytes for level-1") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_bytes_for_level_multiplier(10, + "A multiplier to compute max bytes for level-N (N >= 2)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + level0_stop_writes_trigger(12,"Number of files in level-0\n" + + "\tthat will trigger put stop.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + level0_slowdown_writes_trigger(8,"Number of files in level-0\n" + + "\tthat will slow down writes.") { + @Override public Object 
parseValue(String value) { + return Integer.parseInt(value); + } + }, + level0_file_num_compaction_trigger(4,"Number of files in level-0\n" + + "\twhen compactions start.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + readwritepercent(90,"Ratio of reads to reads/writes (expressed\n" + + "\tas percentage) for the ReadRandomWriteRandom workload. The\n" + + "\tdefault value 90 means 90% operations out of all reads and writes\n" + + "\toperations are reads. In other words, 9 gets for every 1 put.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + mergereadpercent(70,"Ratio of merges to merges&reads (expressed\n" + + "\tas percentage) for the ReadRandomMergeRandom workload. The\n" + + "\tdefault value 70 means 70% out of all read and merge operations\n" + + "\tare merges. In other words, 7 merges for every 3 gets.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + deletepercent(2,"Percentage of deletes out of reads/writes/\n" + + "\tdeletes (used in RandomWithVerify only). RandomWithVerify\n" + + "\tcalculates writepercent as (100 - FLAGS_readwritepercent -\n" + + "\tdeletepercent), so deletepercent must be smaller than (100 -\n" + + "\tFLAGS_readwritepercent)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + disable_seek_compaction(false,"Option to disable compaction\n" + + "\ttriggered by read.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + delete_obsolete_files_period_micros(0L,"Option to delete\n" + + "\tobsolete files periodically. 0 means that obsolete files are\n" + + "\tdeleted after every compaction run.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + compression_level(-1, + "Compression level. 
For zlib this should be -1 for the\n" + + "\tdefault level, or between 0 and 9.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + min_level_to_compress(-1,"If non-negative, compression starts\n" + + "\tfrom this level. Levels with number < min_level_to_compress are\n" + + "\tnot compressed. Otherwise, apply compression_type to\n" + + "\tall levels.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + table_cache_numshardbits(4,"") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + stats_interval(0,"Stats are reported every N operations when\n" + + "\tthis is greater than zero. When 0 the interval grows over time.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + stats_per_interval(0,"Reports additional stats per interval when\n" + + "\tthis is greater than 0.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + perf_level(0,"Level of perf collection.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + soft_rate_limit(0.0,"") { + @Override public Object parseValue(String value) { + return Double.parseDouble(value); + } + }, + hard_rate_limit(0.0,"When not equal to 0 this make threads\n" + + "\tsleep at each stats reporting interval until the compaction\n" + + "\tscore for all levels is less than or equal to this value.") { + @Override public Object parseValue(String value) { + return Double.parseDouble(value); + } + }, + rate_limit_delay_max_milliseconds(1000, + "When hard_rate_limit is set then this is the max time a put will\n" + + "\tbe stalled.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_grandparent_overlap_factor(10,"Control maximum bytes of\n" + + "\toverlaps in grandparent (i.e., level+2) before we stop 
building a\n" + + "\tsingle file in a level->level+1 compaction.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + readonly(false,"Run read only benchmarks.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + disable_auto_compactions(false,"Do not auto trigger compactions.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + source_compaction_factor(1,"Cap the size of data in level-K for\n" + + "\ta compaction run that compacts Level-K with Level-(K+1) (for\n" + + "\tK >= 1)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + wal_ttl_seconds(0L,"Set the TTL for the WAL Files in seconds.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + wal_size_limit_MB(0L,"Set the size limit for the WAL Files\n" + + "\tin MB.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + /* TODO(yhchiang): enable the following + bufferedio(rocksdb::EnvOptions().use_os_buffer, + "Allow buffered io using OS buffers.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + */ + mmap_read(false, + "Allow reads to occur via mmap-ing files.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + mmap_write(false, + "Allow writes to occur via mmap-ing files.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + advise_random_on_open(defaultOptions_.adviseRandomOnOpen(), + "Advise random access on table file open.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + compaction_fadvice("NORMAL", + "Access pattern advice when a file is compacted.") { + @Override public Object parseValue(String value) 
{ + return value; + } + }, + use_tailing_iterator(false, + "Use tailing iterator to access a series of keys instead of get.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + use_adaptive_mutex(defaultOptions_.useAdaptiveMutex(), + "Use adaptive mutex.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + bytes_per_sync(defaultOptions_.bytesPerSync(), + "Allows OS to incrementally sync files to disk while they are\n" + + "\tbeing written, in the background. Issue one request for every\n" + + "\tbytes_per_sync written. 0 turns it off.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + filter_deletes(false," On true, deletes use bloom-filter and drop\n" + + "\tthe delete if key not present.") { + @Override public Object parseValue(String value) { + return Boolean.parseBoolean(value); + } + }, + max_successive_merges(0,"Maximum number of successive merge\n" + + "\toperations on a key in the memtable.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, db("/tmp/rocksdbjni-bench", "Use the db with the following name.") { @Override public Object parseValue(String value) { @@ -859,25 +1467,23 @@ public class DbBenchmark { private final byte[] data_; private int dataLength_; private int position_; + Random rand_; - private RandomGenerator(double compressionRatio) { + private RandomGenerator(long seed, double compressionRatio) { // We use a limited amount of data over and over again and ensure // that it is larger than the compression window (32KB), and also // large enough to serve all typical value sizes we want to write. - Random rand = new Random(301); + rand_ = new Random(seed); dataLength_ = 1048576 + 100; data_ = new byte[dataLength_]; // TODO(yhchiang): mimic test::CompressibleString? 
for (int i = 0; i < dataLength_; ++i) { - data_[i] = (byte) (' ' + rand.nextInt(95)); + data_[i] = (byte) (' ' + rand_.nextInt(95)); } } private byte[] generate(int length) { - if (position_ + length > data_.length) { - position_ = 0; - assert (length < data_.length); - } + position_ = rand_.nextInt(data_.length - length); return Arrays.copyOfRange(data_, position_, position_ + length); } } @@ -911,7 +1517,6 @@ public class DbBenchmark { long startTime_; // memtable related - final int writeBufferSize_; final int maxWriteBufferNumber_; final int prefixSize_; final int keysPerPrefix_; @@ -923,4 +1528,8 @@ public class DbBenchmark { Object finishLock_; boolean isFinished_; + Map flags_; + // as the scope of a static member equals to the scope of the problem, + // we let its c++ pointer to be disposed in its finalizer. + static Options defaultOptions_ = new Options(); } diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index 7c13db6d1..cd3ba785d 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -123,9 +123,9 @@ public class OptionsTest { assert(opt.tableCacheRemoveScanCountLimit() == intValue); } - { // WALTtlSeconds test + { // WalTtlSeconds test long longValue = rand.nextLong(); - opt.setWALTtlSeconds(longValue); + opt.setWalTtlSeconds(longValue); assert(opt.walTtlSeconds() == longValue); } @@ -195,6 +195,228 @@ public class OptionsTest { assert(opt.allowThreadLocal() == boolValue); } + { // WriteBufferSize test + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assert(opt.writeBufferSize() == longValue); + } + + { // MaxWriteBufferNumber test + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assert(opt.maxWriteBufferNumber() == intValue); + } + + { // MinWriteBufferNumberToMerge test + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + assert(opt.minWriteBufferNumberToMerge() == intValue); + } + 
+ { // BlockSize test + long longValue = rand.nextLong(); + opt.setBlockSize(longValue); + assert(opt.blockSize() == longValue); + } + + { // BlockRestartInterval test + int intValue = rand.nextInt(); + opt.setBlockRestartInterval(intValue); + assert(opt.blockRestartInterval() == intValue); + } + + { // WholeKeyFiltering test + boolean boolValue = rand.nextBoolean(); + opt.setWholeKeyFiltering(boolValue); + assert(opt.wholeKeyFiltering() == boolValue); + } + + { // NumLevels test + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assert(opt.numLevels() == intValue); + } + + { // LevelFileNumCompactionTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroFileNumCompactionTrigger(intValue); + assert(opt.levelZeroFileNumCompactionTrigger() == intValue); + } + + { // LevelSlowdownWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assert(opt.levelZeroSlowdownWritesTrigger() == intValue); + } + + { // LevelStopWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assert(opt.levelZeroStopWritesTrigger() == intValue); + } + + { // MaxMemCompactionLevel test + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assert(opt.maxMemCompactionLevel() == intValue); + } + + { // TargetFileSizeBase test + int intValue = rand.nextInt(); + opt.setTargetFileSizeBase(intValue); + assert(opt.targetFileSizeBase() == intValue); + } + + { // TargetFileSizeMultiplier test + int intValue = rand.nextInt(); + opt.setTargetFileSizeMultiplier(intValue); + assert(opt.targetFileSizeMultiplier() == intValue); + } + + { // MaxBytesForLevelBase test + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assert(opt.maxBytesForLevelBase() == longValue); + } + + { // MaxBytesForLevelMultiplier test + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assert(opt.maxBytesForLevelMultiplier() == intValue); + 
} + + { // ExpandedCompactionFactor test + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + assert(opt.expandedCompactionFactor() == intValue); + } + + { // SourceCompactionFactor test + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assert(opt.sourceCompactionFactor() == intValue); + } + + { // MaxGrandparentOverlapFactor test + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assert(opt.maxGrandparentOverlapFactor() == intValue); + } + + { // DisableSeekCompaction test + boolean boolValue = rand.nextBoolean(); + opt.setDisableSeekCompaction(boolValue); + assert(opt.disableSeekCompaction() == boolValue); + } + + { // SoftRateLimit test + double doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assert(opt.softRateLimit() == doubleValue); + } + + { // HardRateLimit test + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assert(opt.hardRateLimit() == doubleValue); + } + + { // RateLimitDelayMaxMilliseconds test + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assert(opt.rateLimitDelayMaxMilliseconds() == intValue); + } + + { // NoBlockCache test + boolean boolValue = rand.nextBoolean(); + opt.setNoBlockCache(boolValue); + assert(opt.noBlockCache() == boolValue); + } + + { // ArenaBlockSize test + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assert(opt.arenaBlockSize() == longValue); + } + + { // DisableAutoCompactions test + boolean boolValue = rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assert(opt.disableAutoCompactions() == boolValue); + } + + { // PurgeRedundantKvsWhileFlush test + boolean boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assert(opt.purgeRedundantKvsWhileFlush() == boolValue); + } + + { // BlockSizeDeviation test + int intValue = rand.nextInt(); + opt.setBlockSizeDeviation(intValue); + 
assert(opt.blockSizeDeviation() == intValue); + } + + { // VerifyChecksumsInCompaction test + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + assert(opt.verifyChecksumsInCompaction() == boolValue); + } + + { // FilterDeletes test + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assert(opt.filterDeletes() == boolValue); + } + + { // MaxSequentialSkipInIterations test + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assert(opt.maxSequentialSkipInIterations() == longValue); + } + + { // InplaceUpdateSupport test + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assert(opt.inplaceUpdateSupport() == boolValue); + } + + { // InplaceUpdateNumLocks test + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assert(opt.inplaceUpdateNumLocks() == longValue); + } + + { // MemtablePrefixBloomBits test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + assert(opt.memtablePrefixBloomBits() == intValue); + } + + { // MemtablePrefixBloomProbes test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomProbes(intValue); + assert(opt.memtablePrefixBloomProbes() == intValue); + } + + { // BloomLocality test + int intValue = rand.nextInt(); + opt.setBloomLocality(intValue); + assert(opt.bloomLocality() == intValue); + } + + { // MaxSuccessiveMerges test + long longValue = rand.nextLong(); + opt.setMaxSuccessiveMerges(longValue); + assert(opt.maxSuccessiveMerges() == longValue); + } + + { // MinPartialMergeOperands test + int intValue = rand.nextInt(); + opt.setMinPartialMergeOperands(intValue); + assert(opt.minPartialMergeOperands() == intValue); + } + opt.dispose(); System.out.println("Passed OptionsTest"); } diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index d9aef3d74..a05a74e7a 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -122,13 
+122,13 @@ jlong Java_org_rocksdb_Options_statisticsPtr( /* * Class: org_rocksdb_Options - * Method: setFilter0 + * Method: setFilterHandle * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setFilter0( - JNIEnv* env, jobject jobj, jlong jopt_handle, jobject jfp) { +void Java_org_rocksdb_Options_setFilterHandle( + JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jfilter_handle) { reinterpret_cast(jopt_handle)->filter_policy = - rocksdb::FilterJni::getHandle(env, jfp); + reinterpret_cast(jfilter_handle); } /* @@ -602,15 +602,36 @@ jlong Java_org_rocksdb_Options_walTtlSeconds( /* * Class: org_rocksdb_Options - * Method: setWALTtlSeconds + * Method: setWalTtlSeconds * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWALTtlSeconds( +void Java_org_rocksdb_Options_setWalTtlSeconds( JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) { reinterpret_cast(jhandle)->WAL_ttl_seconds = static_cast(WAL_ttl_seconds); } +/* + * Class: org_rocksdb_Options + * Method: walTtlSeconds + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_walSizeLimitMB( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->WAL_size_limit_MB; +} + +/* + * Class: org_rocksdb_Options + * Method: setWalSizeLimitMB + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setWalSizeLimitMB( + JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) { + reinterpret_cast(jhandle)->WAL_size_limit_MB = + static_cast(WAL_size_limit_MB); +} + /* * Class: org_rocksdb_Options * Method: manifestPreallocationSize @@ -870,6 +891,764 @@ jstring Java_org_rocksdb_Options_tableFactoryName( return env->NewStringUTF(tf->Name()); } + +/* + * Class: org_rocksdb_Options + * Method: minWriteBufferNumberToMerge + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->min_write_buffer_number_to_merge; +} + +/* + * Class: org_rocksdb_Options + * Method: 
setMinWriteBufferNumberToMerge + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmin_write_buffer_number_to_merge) { + reinterpret_cast( + jhandle)->min_write_buffer_number_to_merge = + static_cast(jmin_write_buffer_number_to_merge); +} + +/* + * Class: org_rocksdb_Options + * Method: blockRestartInterval + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_blockRestartInterval( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->block_restart_interval; +} + +/* + * Class: org_rocksdb_Options + * Method: setBlockRestartInterval + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setBlockRestartInterval( + JNIEnv* env, jobject jobj, jlong jhandle, jint jblock_restart_interval) { + reinterpret_cast(jhandle)->block_restart_interval = + static_cast(jblock_restart_interval); +} + +/* + * Class: org_rocksdb_Options + * Method: wholeKeyFiltering + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_wholeKeyFiltering( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->whole_key_filtering; +} + +/* + * Class: org_rocksdb_Options + * Method: setWholeKeyFiltering + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setWholeKeyFiltering( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwhole_key_filtering) { + reinterpret_cast(jhandle)->whole_key_filtering = + static_cast(jwhole_key_filtering); +} + +/* + * Class: org_rocksdb_Options + * Method: numLevels + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_numLevels( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->num_levels; +} + +/* + * Class: org_rocksdb_Options + * Method: setNumLevels + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setNumLevels( + JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) { + reinterpret_cast(jhandle)->num_levels = + static_cast(jnum_levels); +} + +/* + * Class: 
org_rocksdb_Options + * Method: levelZeroFileNumCompactionTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_file_num_compaction_trigger; +} + +/* + * Class: org_rocksdb_Options + * Method: setLevelZeroFileNumCompactionTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_file_num_compaction_trigger) { + reinterpret_cast( + jhandle)->level0_file_num_compaction_trigger = + static_cast(jlevel0_file_num_compaction_trigger); +} + +/* + * Class: org_rocksdb_Options + * Method: levelZeroSlowdownWritesTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_slowdown_writes_trigger; +} + +/* + * Class: org_rocksdb_Options + * Method: setLevelSlowdownWritesTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_slowdown_writes_trigger) { + reinterpret_cast( + jhandle)->level0_slowdown_writes_trigger = + static_cast(jlevel0_slowdown_writes_trigger); +} + +/* + * Class: org_rocksdb_Options + * Method: levelZeroStopWritesTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_stop_writes_trigger; +} + +/* + * Class: org_rocksdb_Options + * Method: setLevelStopWritesTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_stop_writes_trigger) { + reinterpret_cast(jhandle)->level0_stop_writes_trigger = + static_cast(jlevel0_stop_writes_trigger); +} + +/* + * Class: org_rocksdb_Options 
+ * Method: maxMemCompactionLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxMemCompactionLevel( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_mem_compaction_level; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxMemCompactionLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxMemCompactionLevel( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_mem_compaction_level) { + reinterpret_cast(jhandle)->max_mem_compaction_level = + static_cast(jmax_mem_compaction_level); +} + +/* + * Class: org_rocksdb_Options + * Method: targetFileSizeBase + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_targetFileSizeBase( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->target_file_size_base; +} + +/* + * Class: org_rocksdb_Options + * Method: setTargetFileSizeBase + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setTargetFileSizeBase( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jtarget_file_size_base) { + reinterpret_cast(jhandle)->target_file_size_base = + static_cast(jtarget_file_size_base); +} + +/* + * Class: org_rocksdb_Options + * Method: targetFileSizeMultiplier + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_targetFileSizeMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->target_file_size_multiplier; +} + +/* + * Class: org_rocksdb_Options + * Method: setTargetFileSizeMultiplier + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setTargetFileSizeMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jtarget_file_size_multiplier) { + reinterpret_cast( + jhandle)->target_file_size_multiplier = + static_cast(jtarget_file_size_multiplier); +} + +/* + * Class: org_rocksdb_Options + * Method: maxBytesForLevelBase + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxBytesForLevelBase( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + 
jhandle)->max_bytes_for_level_base; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBytesForLevelBase + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxBytesForLevelBase( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_bytes_for_level_base) { + reinterpret_cast( + jhandle)->max_bytes_for_level_base = + static_cast(jmax_bytes_for_level_base); +} + +/* + * Class: org_rocksdb_Options + * Method: maxBytesForLevelMultiplier + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxBytesForLevelMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_bytes_for_level_multiplier; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBytesForLevelMultiplier + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_bytes_for_level_multiplier) { + reinterpret_cast( + jhandle)->max_bytes_for_level_multiplier = + static_cast(jmax_bytes_for_level_multiplier); +} + +/* + * Class: org_rocksdb_Options + * Method: expandedCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_expandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->expanded_compaction_factor; +} + +/* + * Class: org_rocksdb_Options + * Method: setExpandedCompactionFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setExpandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jexpanded_compaction_factor) { + reinterpret_cast( + jhandle)->expanded_compaction_factor = + static_cast(jexpanded_compaction_factor); +} + +/* + * Class: org_rocksdb_Options + * Method: sourceCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_sourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->source_compaction_factor; +} + +/* + * Class: org_rocksdb_Options + * Method: setSourceCompactionFactor + * 
Signature: (JI)V + */ +void Java_org_rocksdb_Options_setSourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jsource_compaction_factor) { + reinterpret_cast( + jhandle)->source_compaction_factor = + static_cast(jsource_compaction_factor); +} + +/* + * Class: org_rocksdb_Options + * Method: maxGrandparentOverlapFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_grandparent_overlap_factor; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxGrandparentOverlapFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_grandparent_overlap_factor) { + reinterpret_cast( + jhandle)->max_grandparent_overlap_factor = + static_cast(jmax_grandparent_overlap_factor); +} + +/* + * Class: org_rocksdb_Options + * Method: softRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_softRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->soft_rate_limit; +} + +/* + * Class: org_rocksdb_Options + * Method: setSoftRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setSoftRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) { + reinterpret_cast(jhandle)->soft_rate_limit = + static_cast(jsoft_rate_limit); +} + +/* + * Class: org_rocksdb_Options + * Method: hardRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_hardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->hard_rate_limit; +} + +/* + * Class: org_rocksdb_Options + * Method: setHardRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setHardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) { + reinterpret_cast(jhandle)->hard_rate_limit = + static_cast(jhard_rate_limit); +} + +/* + * 
Class: org_rocksdb_Options + * Method: rateLimitDelayMaxMilliseconds + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_rateLimitDelayMaxMilliseconds( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds; +} + +/* + * Class: org_rocksdb_Options + * Method: setRateLimitDelayMaxMilliseconds + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jrate_limit_delay_max_milliseconds) { + reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds = + static_cast(jrate_limit_delay_max_milliseconds); +} + +/* + * Class: org_rocksdb_Options + * Method: noBlockCache + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_noBlockCache( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->no_block_cache; +} + +/* + * Class: org_rocksdb_Options + * Method: setNoBlockCache + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setNoBlockCache( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jno_block_cache) { + reinterpret_cast(jhandle)->no_block_cache = + static_cast(jno_block_cache); +} + +/* + * Class: org_rocksdb_Options + * Method: arenaBlockSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_arenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->arena_block_size; +} + +/* + * Class: org_rocksdb_Options + * Method: setArenaBlockSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setArenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { + reinterpret_cast(jhandle)->arena_block_size = + static_cast(jarena_block_size); +} + +/* + * Class: org_rocksdb_Options + * Method: disableAutoCompactions + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_disableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->disable_auto_compactions; 
+} + +/* + * Class: org_rocksdb_Options + * Method: setDisableAutoCompactions + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setDisableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jdisable_auto_compactions) { + reinterpret_cast( + jhandle)->disable_auto_compactions = + static_cast(jdisable_auto_compactions); +} + +/* + * Class: org_rocksdb_Options + * Method: purgeRedundantKvsWhileFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_purgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->purge_redundant_kvs_while_flush; +} + +/* + * Class: org_rocksdb_Options + * Method: setPurgeRedundantKvsWhileFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jpurge_redundant_kvs_while_flush) { + reinterpret_cast( + jhandle)->purge_redundant_kvs_while_flush = + static_cast(jpurge_redundant_kvs_while_flush); +} + +/* + * Class: org_rocksdb_Options + * Method: blockSizeDeviation + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_blockSizeDeviation( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->block_size_deviation; +} + +/* + * Class: org_rocksdb_Options + * Method: setBlockSizeDeviation + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setBlockSizeDeviation( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jblock_size_deviation) { + reinterpret_cast(jhandle)->block_size_deviation = + static_cast(jblock_size_deviation); +} + +/* + * Class: org_rocksdb_Options + * Method: verifyChecksumsInCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_verifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->verify_checksums_in_compaction; +} + +/* + * Class: org_rocksdb_Options + * Method: setVerifyChecksumsInCompaction + * Signature: (JZ)V + */ +void 
Java_org_rocksdb_Options_setVerifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jverify_checksums_in_compaction) { + reinterpret_cast( + jhandle)->verify_checksums_in_compaction = + static_cast(jverify_checksums_in_compaction); +} + +/* + * Class: org_rocksdb_Options + * Method: filterDeletes + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_filterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->filter_deletes; +} + +/* + * Class: org_rocksdb_Options + * Method: setFilterDeletes + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setFilterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) { + reinterpret_cast(jhandle)->filter_deletes = + static_cast(jfilter_deletes); +} + +/* + * Class: org_rocksdb_Options + * Method: maxSequentialSkipInIterations + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxSequentialSkipInIterations + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_sequential_skip_in_iterations) { + reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations = + static_cast(jmax_sequential_skip_in_iterations); +} + +/* + * Class: org_rocksdb_Options + * Method: inplaceUpdateSupport + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_inplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_support; +} + +/* + * Class: org_rocksdb_Options + * Method: setInplaceUpdateSupport + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setInplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jinplace_update_support) { + reinterpret_cast( + 
jhandle)->inplace_update_support = + static_cast(jinplace_update_support); +} + +/* + * Class: org_rocksdb_Options + * Method: inplaceUpdateNumLocks + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_num_locks; +} + +/* + * Class: org_rocksdb_Options + * Method: setInplaceUpdateNumLocks + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jinplace_update_num_locks) { + reinterpret_cast( + jhandle)->inplace_update_num_locks = + static_cast(jinplace_update_num_locks); +} + +/* + * Class: org_rocksdb_Options + * Method: memtablePrefixBloomBits + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_memtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits; +} + +/* + * Class: org_rocksdb_Options + * Method: setMemtablePrefixBloomBits + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMemtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_bits) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits = + static_cast(jmemtable_prefix_bloom_bits); +} + +/* + * Class: org_rocksdb_Options + * Method: memtablePrefixBloomProbes + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_memtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes; +} + +/* + * Class: org_rocksdb_Options + * Method: setMemtablePrefixBloomProbes + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMemtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_probes) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes = + static_cast(jmemtable_prefix_bloom_probes); +} + +/* + * Class: org_rocksdb_Options + * Method: 
bloomLocality + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_bloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->bloom_locality; +} + +/* + * Class: org_rocksdb_Options + * Method: setBloomLocality + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setBloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) { + reinterpret_cast(jhandle)->bloom_locality = + static_cast(jbloom_locality); +} + +/* + * Class: org_rocksdb_Options + * Method: maxSuccessiveMerges + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxSuccessiveMerges( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_successive_merges; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxSuccessiveMerges + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxSuccessiveMerges( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_successive_merges) { + reinterpret_cast(jhandle)->max_successive_merges = + static_cast(jmax_successive_merges); +} + +/* + * Class: org_rocksdb_Options + * Method: minPartialMergeOperands + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_minPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->min_partial_merge_operands; +} + +/* + * Class: org_rocksdb_Options + * Method: setMinPartialMergeOperands + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMinPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmin_partial_merge_operands) { + reinterpret_cast( + jhandle)->min_partial_merge_operands = + static_cast(jmin_partial_merge_operands); +} + ////////////////////////////////////////////////////////////////////////////// // WriteOptions diff --git a/port/stack_trace.cc b/port/stack_trace.cc index aa01fd0cf..76866e63c 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -3,9 +3,19 @@ // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -#include "util/stack_trace.h" +#include "port/stack_trace.h" -#ifdef OS_LINUX +namespace rocksdb { +namespace port { + +#if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX)) + +// noop + +void InstallStackTraceHandler() {} +void PrintStack(int first_frames_to_skip) {} + +#else #include #include @@ -13,11 +23,12 @@ #include #include #include +#include -namespace rocksdb { +namespace { -static const char* GetExecutableName() -{ +#ifdef OS_LINUX +const char* GetExecutableName() { static char name[1024]; char link[1024]; @@ -31,38 +42,68 @@ static const char* GetExecutableName() } } +void PrintStackTraceLine(const char* symbol, void* frame) { + static const char* executable = GetExecutableName(); + if (symbol) { + fprintf(stderr, "%s ", symbol); + } + if (executable) { + // out source to addr2line, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } + } else { + fprintf(stderr, " %p", frame); + } + + fprintf(stderr, "\n"); +} +#elif OS_MACOSX + +void PrintStackTraceLine(const char* symbol, void* frame) { + static int pid = getpid(); + // out source to atos, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", frame, pid); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } else if (symbol) { + fprintf(stderr, "%s ", symbol); + } + + fprintf(stderr, "\n"); +} + +#endif + +} // namespace + void PrintStack(int 
first_frames_to_skip) { const int kMaxFrames = 100; - void *frames[kMaxFrames]; + void* frames[kMaxFrames]; auto num_frames = backtrace(frames, kMaxFrames); auto symbols = backtrace_symbols(frames, num_frames); - auto executable = GetExecutableName(); - for (int i = first_frames_to_skip; i < num_frames; ++i) { fprintf(stderr, "#%-2d ", i - first_frames_to_skip); - if (symbols) { - fprintf(stderr, "%s ", symbols[i]); - } - if (executable) { - // out source to addr2line, for the address translation - const int kLineMax = 256; - char cmd[kLineMax]; - sprintf(cmd, "addr2line %p -e %s -f -C 2>&1", frames[i], executable); - auto f = popen(cmd, "r"); - if (f) { - char line[kLineMax]; - while (fgets(line, sizeof(line), f)) { - line[strlen(line) - 1] = 0; // remove newline - fprintf(stderr, "%s\t", line); - } - pclose(f); - } - } else { - fprintf(stderr, " %p", frames[i]); - } - fprintf(stderr, "\n"); + PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]); } } @@ -83,20 +124,9 @@ void InstallStackTraceHandler() { signal(SIGSEGV, StackTraceHandler); signal(SIGBUS, StackTraceHandler); signal(SIGABRT, StackTraceHandler); - - printf("Installed stack trace handler for SIGILL SIGSEGV SIGBUS SIGABRT\n"); - } -} // namespace rocksdb - -#else // no-op for non-linux system for now - -namespace rocksdb { - -void InstallStackTraceHandler() {} -void PrintStack(int first_frames_to_skip) {} - -} +#endif -#endif // OS_LINUX +} // namespace port +} // namespace rocksdb diff --git a/util/stack_trace.h b/port/stack_trace.h similarity index 90% rename from util/stack_trace.h rename to port/stack_trace.h index 3b06e1df0..8bc6c7d2e 100644 --- a/util/stack_trace.h +++ b/port/stack_trace.h @@ -5,6 +5,7 @@ // #pragma once namespace rocksdb { +namespace port { // Install a signal handler to print callstack on the following signals: // SIGILL SIGSEGV SIGBUS SIGABRT @@ -14,4 +15,5 @@ void InstallStackTraceHandler(); // Prints stack, skips skip_first_frames frames void 
PrintStack(int first_frames_to_skip = 0); -} // namespace rocksdb +} // namespace port +} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 35f6a194c..f75726108 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -642,94 +642,6 @@ FilterBlockReader* BlockBasedTable::ReadFilter ( rep->options, block.data, block.heap_allocated); } -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -Iterator* BlockBasedTable::DataBlockReader(void* arg, - const ReadOptions& options, - const Slice& index_value, - bool* didIO, bool for_compaction) { - const bool no_io = (options.read_tier == kBlockCacheTier); - BlockBasedTable* table = reinterpret_cast(arg); - Cache* block_cache = table->rep_->options.block_cache.get(); - Cache* block_cache_compressed = table->rep_->options. - block_cache_compressed.get(); - CachableEntry block; - - BlockHandle handle; - Slice input = index_value; - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. - Status s = handle.DecodeFrom(&input); - - if (!s.ok()) { - return NewErrorIterator(s); - } - - // If either block cache is enabled, we'll try to read from it. 
- if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = table->rep_->options.statistics.get(); - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice key, /* key to the block cache */ - ckey /* key to the compressed block cache */; - - // create key for block cache - if (block_cache != nullptr) { - key = GetCacheKey(table->rep_->cache_key_prefix, - table->rep_->cache_key_prefix_size, handle, cache_key); - } - - if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(table->rep_->compressed_cache_key_prefix, - table->rep_->compressed_cache_key_prefix_size, handle, - compressed_cache_key); - } - - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - statistics, options, &block); - - if (block.value == nullptr && !no_io && options.fill_cache) { - Histograms histogram = for_compaction ? - READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; - Block* raw_block = nullptr; - { - StopWatch sw(table->rep_->options.env, statistics, histogram); - s = ReadBlockFromFile(table->rep_->file.get(), options, handle, - &raw_block, table->rep_->options.env, didIO, - block_cache_compressed == nullptr); - } - - if (s.ok()) { - s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, - options, statistics, &block, raw_block); - } - } - } - - // Didn't get any data from block caches. 
- if (block.value == nullptr) { - if (no_io) { - // Could not read from block_cache and can't do IO - return NewErrorIterator(Status::Incomplete("no blocking io")); - } - s = ReadBlockFromFile(table->rep_->file.get(), options, handle, - &block.value, table->rep_->options.env, didIO); - } - - Iterator* iter; - if (block.value != nullptr) { - iter = block.value->NewIterator(&table->rep_->internal_comparator); - if (block.cache_handle != nullptr) { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { - iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); - } - } else { - iter = NewErrorIterator(s); - } - return iter; -} BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( bool no_io) const { @@ -838,13 +750,115 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) { return iter; } -Iterator* BlockBasedTable::DataBlockReader( - void* arg, const ReadOptions& options, const EnvOptions& soptions, - const InternalKeyComparator& icomparator, const Slice& index_value, - bool for_compaction) { - return DataBlockReader(arg, options, index_value, nullptr, for_compaction); +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, + const ReadOptions& ro, bool* didIO, const Slice& index_value) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + Cache* block_cache = rep->options.block_cache.get(); + Cache* block_cache_compressed = rep->options. + block_cache_compressed.get(); + CachableEntry block; + + BlockHandle handle; + Slice input = index_value; + // We intentionally allow extra stuff in index_value so that we + // can add more features in the future. + Status s = handle.DecodeFrom(&input); + + if (!s.ok()) { + return NewErrorIterator(s); + } + + // If either block cache is enabled, we'll try to read from it. 
+ if (block_cache != nullptr || block_cache_compressed != nullptr) { + Statistics* statistics = rep->options.statistics.get(); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key, /* key to the block cache */ + ckey /* key to the compressed block cache */; + + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey(rep->cache_key_prefix, + rep->cache_key_prefix_size, handle, cache_key); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey(rep->compressed_cache_key_prefix, + rep->compressed_cache_key_prefix_size, handle, + compressed_cache_key); + } + + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + statistics, ro, &block); + + if (block.value == nullptr && !no_io && ro.fill_cache) { + Histograms histogram = READ_BLOCK_GET_MICROS; + Block* raw_block = nullptr; + { + StopWatch sw(rep->options.env, statistics, histogram); + s = ReadBlockFromFile(rep->file.get(), ro, handle, + &raw_block, rep->options.env, didIO, + block_cache_compressed == nullptr); + } + + if (s.ok()) { + s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, + ro, statistics, &block, raw_block); + } + } + } + + // Didn't get any data from block caches. 
+ if (block.value == nullptr) { + if (no_io) { + // Could not read from block_cache and can't do IO + return NewErrorIterator(Status::Incomplete("no blocking io")); + } + s = ReadBlockFromFile(rep->file.get(), ro, handle, + &block.value, rep->options.env, didIO); + } + + Iterator* iter; + if (block.value != nullptr) { + iter = block.value->NewIterator(&rep->internal_comparator); + if (block.cache_handle != nullptr) { + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, + block.cache_handle); + } else { + iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); + } + } else { + iter = NewErrorIterator(s); + } + return iter; } +class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { + public: + BlockEntryIteratorState(BlockBasedTable* table, + const ReadOptions& read_options, bool* did_io) + : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), + table_(table), read_options_(read_options), did_io_(did_io) {} + + Iterator* NewSecondaryIterator(const Slice& index_value) override { + return NewDataBlockIterator(table_->rep_, read_options_, did_io_, + index_value); + } + + bool PrefixMayMatch(const Slice& internal_key) override { + return table_->PrefixMayMatch(internal_key); + } + + private: + // Don't own table_ + BlockBasedTable* table_; + const ReadOptions read_options_; + // Don't own did_io_ + bool* did_io_; +}; + // This will be broken if the user specifies an unusual implementation // of Options.comparator, or if the user specifies an unusual // definition of prefixes in Options.filter_policy. In particular, we @@ -857,7 +871,13 @@ Iterator* BlockBasedTable::DataBlockReader( // Otherwise, this method guarantees no I/O will be incurred. // // REQUIRES: this method shouldn't be called while the DB lock is held. 
-bool BlockBasedTable::PrefixMayMatch(const Slice& internal_prefix) { +bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { + assert(rep_->options.prefix_extractor != nullptr); + auto prefix = rep_->options.prefix_extractor->Transform( + ExtractUserKey(internal_key)); + InternalKey internal_key_prefix(prefix, 0, kTypeValue); + auto internal_prefix = internal_key_prefix.Encode(); + bool may_match = true; Status s; @@ -918,20 +938,10 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_prefix) { return may_match; } -Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) { - if (options.prefix) { - InternalKey internal_prefix(*options.prefix, 0, kTypeValue); - if (!PrefixMayMatch(internal_prefix.Encode())) { - // nothing in this file can match the prefix, so we should not - // bother doing I/O to this file when iterating. - return NewEmptyIterator(); - } - } - - return NewTwoLevelIterator(NewIndexIterator(options), - &BlockBasedTable::DataBlockReader, - const_cast(this), options, - rep_->soptions, rep_->internal_comparator); +Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options) { + return NewTwoLevelIterator(new BlockEntryIteratorState(this, read_options, + nullptr), + NewIndexIterator(read_options)); } Status BlockBasedTable::Get( @@ -962,7 +972,7 @@ Status BlockBasedTable::Get( } else { bool didIO = false; unique_ptr block_iter( - DataBlockReader(this, read_options, iiter->value(), &didIO)); + NewDataBlockIterator(rep_, read_options, &didIO, iiter->value())); if (read_options.read_tier && block_iter->status().IsIncomplete()) { // couldn't get block from block_cache @@ -1059,10 +1069,8 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) { return HashIndexReader::Create( file, index_handle, env, comparator, [&](Iterator* index_iter) { - return NewTwoLevelIterator( - index_iter, &BlockBasedTable::DataBlockReader, - const_cast(this), ReadOptions(), - rep_->soptions, rep_->internal_comparator); + 
return NewTwoLevelIterator(new BlockEntryIteratorState(this, + ReadOptions(), nullptr), index_iter); }, rep_->internal_prefix_transform.get(), index_reader); } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index d48c5d2c7..fbe47272e 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -63,7 +63,7 @@ class BlockBasedTable : public TableReader { unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader); - bool PrefixMayMatch(const Slice& internal_prefix) override; + bool PrefixMayMatch(const Slice& internal_key); // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must @@ -111,13 +111,9 @@ class BlockBasedTable : public TableReader { Rep* rep_; bool compaction_optimized_; - static Iterator* DataBlockReader(void*, const ReadOptions&, - const EnvOptions& soptions, - const InternalKeyComparator& icomparator, - const Slice&, bool for_compaction); - - static Iterator* DataBlockReader(void*, const ReadOptions&, const Slice&, - bool* didIO, bool for_compaction = false); + struct BlockEntryIteratorState; + static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, + bool* didIO, const Slice& index_value); // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index b23620785..84af22fb9 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -2,8 +2,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE #pragma once + +#ifndef ROCKSDB_LITE #include #include diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index ac0505a45..196201730 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -104,8 +104,8 @@ PlainTableReader::PlainTableReader( kHashTableRatio(hash_table_ratio), kBloomBitsPerKey(bloom_bits_per_key), kIndexIntervalForSamePrefixKeys(index_sparseness), - table_properties_(table_properties), - data_end_offset_(table_properties_->data_size), + table_properties_(nullptr), + data_end_offset_(table_properties->data_size), user_key_len_(table_properties->fixed_key_len) { assert(kHashTableRatio >= 0.0); } @@ -137,7 +137,7 @@ Status PlainTableReader::Open( bloom_bits_per_key, hash_table_ratio, index_sparseness, props)); // -- Populate Index - s = new_reader->PopulateIndex(); + s = new_reader->PopulateIndex(props); if (!s.ok()) { return s; } @@ -149,12 +149,8 @@ Status PlainTableReader::Open( void PlainTableReader::SetupForCompaction() { } -bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) { - return true; -} - Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { - return new PlainTableIterator(this, options.prefix_seek); + return new PlainTableIterator(this, options_.prefix_extractor != nullptr); } struct PlainTableReader::IndexRecord { @@ -364,7 +360,10 @@ void PlainTableReader::FillIndexes( index_size_, kSubIndexSize); } -Status PlainTableReader::PopulateIndex() { +Status PlainTableReader::PopulateIndex(TableProperties* props) { + assert(props != nullptr); + table_properties_.reset(props); + // options.prefix_extractor is requried for a hash-based look-up. if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) { return Status::NotSupported( @@ -409,6 +408,14 @@ Status PlainTableReader::PopulateIndex() { // From the temp data structure, populate indexes. 
FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket); + // Fill two table properties. + // TODO(sdong): after we have the feature of storing index in file, this + // properties need to be populated to index_size instead. + props->user_collected_properties["plain_table_hash_table_size"] = + std::to_string(index_size_ * 4U); + props->user_collected_properties["plain_table_sub_index_size"] = + std::to_string(sub_index_size_needed); + return Status::OK(); } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index debb88372..756439b5c 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -2,8 +2,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #pragma once + +#ifndef ROCKSDB_LITE #include #include #include @@ -53,8 +54,6 @@ class PlainTableReader: public TableReader { const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness); - bool PrefixMayMatch(const Slice& internal_prefix); - Iterator* NewIterator(const ReadOptions&); Status Get(const ReadOptions&, const Slice& key, void* arg, @@ -87,6 +86,9 @@ class PlainTableReader: public TableReader { // PopulateIndex() builds index of keys. It must be called before any query // to the table. // + // props: the table properties object that need to be stored. Ownership of + // the object will be passed. + // // index_ contains buckets size of index_size_, each is a // 32-bit integer. The lower 31 bits contain an offset value (explained below) // and the first bit of the integer indicates type of the offset. @@ -122,7 +124,7 @@ class PlainTableReader: public TableReader { // .... 
// record N file offset: fixedint32 // - Status PopulateIndex(); + Status PopulateIndex(TableProperties* props); private: struct IndexRecord; diff --git a/table/table_reader.h b/table/table_reader.h index 3d2738c9c..02a2d16dc 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -25,12 +25,6 @@ class TableReader { public: virtual ~TableReader() {} - // Determine whether there is a chance that the current table file - // contains the key a key starting with iternal_prefix. The specific - // table implementation can use bloom filter and/or other heuristic - // to filter out this table as a whole. - virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; - // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 32f6ee618..a0ff0d7f0 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -68,8 +68,6 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, bool through_db, bool measured_by_nanosecond) { rocksdb::InternalKeyComparator ikc(opts.comparator); - Slice prefix = Slice(); - std::string file_name = test::TmpDir() + "/rocksdb_table_reader_benchmark"; std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db"; @@ -156,10 +154,6 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } std::string start_key = MakeKey(r1, r2, through_db); std::string end_key = MakeKey(r1, r2 + r2_len, through_db); - if (prefix_len < 16) { - prefix = Slice(start_key.data(), prefix_len); - read_options.prefix = &prefix; - } uint64_t total_time = 0; uint64_t start_time = Now(env, measured_by_nanosecond); port::MemoryBarrier(); @@ -254,7 +248,6 @@ int main(int argc, char** argv) { options.compression = rocksdb::CompressionType::kNoCompression; if (FLAGS_plain_table) { - ro.prefix_seek = true; 
options.allow_mmap_reads = true; env_options.use_mmap_reads = true; tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8, diff --git a/table/table_test.cc b/table/table_test.cc index 0426122ff..2e21c5064 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -307,11 +307,9 @@ class KeyConvertingIterator: public Iterator { class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, - bool convert_to_internal_key = false, - bool prefix_seek = false) + bool convert_to_internal_key = false) : Constructor(cmp), - convert_to_internal_key_(convert_to_internal_key), - prefix_seek_(prefix_seek) {} + convert_to_internal_key_(convert_to_internal_key) {} ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, @@ -352,9 +350,6 @@ class TableConstructor: public Constructor { virtual Iterator* NewIterator() const { ReadOptions ro; - if (prefix_seek_) { - ro.prefix_seek = true; - } Iterator* iter = table_reader_->NewIterator(ro); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); @@ -388,7 +383,6 @@ class TableConstructor: public Constructor { source_.reset(); } bool convert_to_internal_key_; - bool prefix_seek_; uint64_t uniq_id_; unique_ptr sink_; @@ -434,7 +428,7 @@ class MemTableConstructor: public Constructor { return Status::OK(); } virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator()); + return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions())); } private: @@ -699,7 +693,7 @@ class Harness { options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2)); options_.allow_mmap_reads = true; options_.table_factory.reset(NewPlainTableFactory()); - constructor_ = new TableConstructor(options_.comparator, true, true); + constructor_ = new TableConstructor(options_.comparator, true); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -709,7 +703,7 @@ class Harness { 
options_.prefix_extractor.reset(NewNoopTransform()); options_.allow_mmap_reads = true; options_.table_factory.reset(NewPlainTableFactory()); - constructor_ = new TableConstructor(options_.comparator, true, true); + constructor_ = new TableConstructor(options_.comparator, true); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -719,7 +713,7 @@ class Harness { options_.prefix_extractor = nullptr; options_.allow_mmap_reads = true; options_.table_factory.reset(NewTotalOrderPlainTableFactory()); - constructor_ = new TableConstructor(options_.comparator, true, false); + constructor_ = new TableConstructor(options_.comparator, true); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -1667,7 +1661,7 @@ TEST(MemTableTest, Simple) { ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); - Iterator* iter = memtable->NewIterator(); + Iterator* iter = memtable->NewIterator(ReadOptions()); iter->SeekToFirst(); while (iter->Valid()) { fprintf(stderr, "key: '%s' -> '%s'\n", diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 65a58ad93..990f18184 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -13,26 +13,17 @@ #include "rocksdb/table.h" #include "table/block.h" #include "table/format.h" -#include "table/iterator_wrapper.h" namespace rocksdb { namespace { -typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, - const EnvOptions& soptions, - const InternalKeyComparator& icomparator, - const Slice&, bool for_compaction); - class TwoLevelIterator: public Iterator { public: - TwoLevelIterator(Iterator* index_iter, BlockFunction block_function, - void* arg, const ReadOptions& options, - const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - bool for_compaction); + explicit TwoLevelIterator(TwoLevelIteratorState* state, + Iterator* 
first_level_iter); - virtual ~TwoLevelIterator(); + virtual ~TwoLevelIterator() {} virtual void Seek(const Slice& target); virtual void SeekToFirst(); @@ -41,22 +32,23 @@ class TwoLevelIterator: public Iterator { virtual void Prev(); virtual bool Valid() const { - return data_iter_.Valid(); + return second_level_iter_.Valid(); } virtual Slice key() const { assert(Valid()); - return data_iter_.key(); + return second_level_iter_.key(); } virtual Slice value() const { assert(Valid()); - return data_iter_.value(); + return second_level_iter_.value(); } virtual Status status() const { // It'd be nice if status() returned a const Status& instead of a Status - if (!index_iter_.status().ok()) { - return index_iter_.status(); - } else if (data_iter_.iter() != nullptr && !data_iter_.status().ok()) { - return data_iter_.status(); + if (!first_level_iter_.status().ok()) { + return first_level_iter_.status(); + } else if (second_level_iter_.iter() != nullptr && + !second_level_iter_.status().ok()) { + return second_level_iter_.status(); } else { return status_; } @@ -68,135 +60,131 @@ class TwoLevelIterator: public Iterator { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetDataIterator(Iterator* data_iter); + void SetSecondLevelIterator(Iterator* iter); void InitDataBlock(); - BlockFunction block_function_; - void* arg_; - const ReadOptions options_; - const EnvOptions& soptions_; - const InternalKeyComparator& internal_comparator_; + std::unique_ptr state_; + IteratorWrapper first_level_iter_; + IteratorWrapper second_level_iter_; // May be nullptr Status status_; - IteratorWrapper index_iter_; - IteratorWrapper data_iter_; // May be nullptr - // If data_iter_ is non-nullptr, then "data_block_handle_" holds the - // "index_value" passed to block_function_ to create the data_iter_. + // If second_level_iter is non-nullptr, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the second_level_iter. 
std::string data_block_handle_; - bool for_compaction_; }; -TwoLevelIterator::TwoLevelIterator( - Iterator* index_iter, BlockFunction block_function, void* arg, - const ReadOptions& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, bool for_compaction) - : block_function_(block_function), - arg_(arg), - options_(options), - soptions_(soptions), - internal_comparator_(internal_comparator), - index_iter_(index_iter), - data_iter_(nullptr), - for_compaction_(for_compaction) {} - -TwoLevelIterator::~TwoLevelIterator() { -} +TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, + Iterator* first_level_iter) + : state_(state), first_level_iter_(first_level_iter) {} void TwoLevelIterator::Seek(const Slice& target) { - index_iter_.Seek(target); + if (state_->check_prefix_may_match && + !state_->PrefixMayMatch(target)) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Seek(target); + InitDataBlock(); - if (data_iter_.iter() != nullptr) data_iter_.Seek(target); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.Seek(target); + } SkipEmptyDataBlocksForward(); } void TwoLevelIterator::SeekToFirst() { - index_iter_.SeekToFirst(); + first_level_iter_.SeekToFirst(); InitDataBlock(); - if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } SkipEmptyDataBlocksForward(); } void TwoLevelIterator::SeekToLast() { - index_iter_.SeekToLast(); + first_level_iter_.SeekToLast(); InitDataBlock(); - if (data_iter_.iter() != nullptr) data_iter_.SeekToLast(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToLast(); + } SkipEmptyDataBlocksBackward(); } void TwoLevelIterator::Next() { assert(Valid()); - data_iter_.Next(); + second_level_iter_.Next(); SkipEmptyDataBlocksForward(); } void TwoLevelIterator::Prev() { assert(Valid()); - data_iter_.Prev(); + second_level_iter_.Prev(); 
SkipEmptyDataBlocksBackward(); } void TwoLevelIterator::SkipEmptyDataBlocksForward() { - while (data_iter_.iter() == nullptr || (!data_iter_.Valid() && - !data_iter_.status().IsIncomplete())) { + while (second_level_iter_.iter() == nullptr || + (!second_level_iter_.Valid() && + !second_level_iter_.status().IsIncomplete())) { // Move to next block - if (!index_iter_.Valid()) { - SetDataIterator(nullptr); + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); return; } - index_iter_.Next(); + first_level_iter_.Next(); InitDataBlock(); - if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } } } void TwoLevelIterator::SkipEmptyDataBlocksBackward() { - while (data_iter_.iter() == nullptr || (!data_iter_.Valid() && - !data_iter_.status().IsIncomplete())) { + while (second_level_iter_.iter() == nullptr || + (!second_level_iter_.Valid() && + !second_level_iter_.status().IsIncomplete())) { // Move to next block - if (!index_iter_.Valid()) { - SetDataIterator(nullptr); + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); return; } - index_iter_.Prev(); + first_level_iter_.Prev(); InitDataBlock(); - if (data_iter_.iter() != nullptr) data_iter_.SeekToLast(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToLast(); + } } } -void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { - if (data_iter_.iter() != nullptr) SaveError(data_iter_.status()); - data_iter_.Set(data_iter); +void TwoLevelIterator::SetSecondLevelIterator(Iterator* iter) { + if (second_level_iter_.iter() != nullptr) { + SaveError(second_level_iter_.status()); + } + second_level_iter_.Set(iter); } void TwoLevelIterator::InitDataBlock() { - if (!index_iter_.Valid()) { - SetDataIterator(nullptr); + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); } else { - Slice handle = index_iter_.value(); - if (data_iter_.iter() != nullptr + Slice handle = 
first_level_iter_.value(); + if (second_level_iter_.iter() != nullptr && handle.compare(data_block_handle_) == 0) { - // data_iter_ is already constructed with this iterator, so + // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - Iterator* iter = - (*block_function_)(arg_, options_, soptions_, internal_comparator_, - handle, for_compaction_); + Iterator* iter = state_->NewSecondaryIterator(handle); data_block_handle_.assign(handle.data(), handle.size()); - SetDataIterator(iter); + SetSecondLevelIterator(iter); } } } } // namespace -Iterator* NewTwoLevelIterator(Iterator* index_iter, - BlockFunction block_function, void* arg, - const ReadOptions& options, - const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - bool for_compaction) { - return new TwoLevelIterator(index_iter, block_function, arg, options, - soptions, internal_comparator, for_compaction); +Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state, + Iterator* first_level_iter) { + return new TwoLevelIterator(state, first_level_iter); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index d313dcb18..b8083385b 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -10,12 +10,26 @@ #pragma once #include "rocksdb/iterator.h" #include "rocksdb/env.h" +#include "table/iterator_wrapper.h" namespace rocksdb { struct ReadOptions; class InternalKeyComparator; +struct TwoLevelIteratorState { + explicit TwoLevelIteratorState(bool check_prefix_may_match) + : check_prefix_may_match(check_prefix_may_match) {} + + virtual ~TwoLevelIteratorState() {} + virtual Iterator* NewSecondaryIterator(const Slice& handle) = 0; + virtual bool PrefixMayMatch(const Slice& internal_key) = 0; + + // If call PrefixMayMatch() + bool check_prefix_may_match; +}; + + // Return a new two level iterator. 
A two-level iterator contains an // index iterator whose values point to a sequence of blocks where // each block is itself a sequence of key,value pairs. The returned @@ -25,14 +39,7 @@ class InternalKeyComparator; // // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. -extern Iterator* NewTwoLevelIterator( - Iterator* index_iter, - Iterator* (*block_function)( - void* arg, const ReadOptions& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - const Slice& index_value, bool for_compaction), - void* arg, const ReadOptions& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - bool for_compaction = false); +extern Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state, + Iterator* first_level_iter); } // namespace rocksdb diff --git a/tools/db_stress.cc b/tools/db_stress.cc index c7837c38b..8b8523f89 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -547,19 +547,20 @@ class SharedState { public: static const uint32_t SENTINEL; - explicit SharedState(StressTest* stress_test) : - cv_(&mu_), - seed_(FLAGS_seed), - max_key_(FLAGS_max_key), - log2_keys_per_lock_(FLAGS_log2_keys_per_lock), - num_threads_(FLAGS_threads), - num_initialized_(0), - num_populated_(0), - vote_reopen_(0), - num_done_(0), - start_(false), - start_verify_(false), - stress_test_(stress_test) { + explicit SharedState(StressTest* stress_test) + : cv_(&mu_), + seed_(FLAGS_seed), + max_key_(FLAGS_max_key), + log2_keys_per_lock_(FLAGS_log2_keys_per_lock), + num_threads_(FLAGS_threads), + num_initialized_(0), + num_populated_(0), + vote_reopen_(0), + num_done_(0), + start_(false), + start_verify_(false), + stress_test_(stress_test), + verification_failure_(false) { if (FLAGS_test_batches_snapshots) { fprintf(stdout, "No lock creation because test_batches_snapshots set\n"); return; @@ -651,6 +652,10 @@ class SharedState { return start_verify_; } 
+ void SetVerificationFailure() { verification_failure_.store(true); } + + bool HasVerificationFailedYet() { return verification_failure_.load(); } + port::Mutex* GetMutexForKey(int cf, long key) { return &key_locks_[cf][key >> log2_keys_per_lock_]; } @@ -695,6 +700,7 @@ class SharedState { bool start_; bool start_verify_; StressTest* stress_test_; + std::atomic verification_failure_; std::vector> values_; std::vector> key_locks_; @@ -752,7 +758,7 @@ class StressTest { delete filter_policy_; } - void Run() { + bool Run() { PrintEnv(); Open(); SharedState shared(this); @@ -814,6 +820,12 @@ class StressTest { FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); } PrintStatistics(); + + if (shared.HasVerificationFailedYet()) { + printf("Verification failed :(\n"); + return false; + } + return true; } private: @@ -996,7 +1008,6 @@ class StressTest { prefixes[i].resize(FLAGS_prefix_size); prefix_slices[i] = Slice(prefixes[i]); readoptionscopy[i] = readoptions; - readoptionscopy[i].prefix_seek = true; readoptionscopy[i].snapshot = snapshot; iters[i] = db_->NewIterator(readoptionscopy[i], column_family); iters[i]->Seek(prefix_slices[i]); @@ -1062,7 +1073,6 @@ class StressTest { const Snapshot* snapshot = db_->GetSnapshot(); ReadOptions readoptionscopy = readoptions; readoptionscopy.snapshot = snapshot; - readoptionscopy.prefix_seek = FLAGS_prefix_size > 0; unique_ptr iter(db_->NewIterator(readoptionscopy, column_family)); iter->Seek(key); @@ -1101,7 +1111,10 @@ class StressTest { thread->stats.Start(); for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - if(i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) { { thread->stats.FinishedSingleOp(); MutexLock l(thread->shared->GetMutex()); @@ -1183,7 +1196,6 @@ class StressTest { // prefix if (!FLAGS_test_batches_snapshots) { Slice prefix = Slice(key.data(), 
FLAGS_prefix_size); - read_opts.prefix_seek = true; Iterator* iter = db_->NewIterator(read_opts, column_family); int64_t count = 0; for (iter->Seek(prefix); @@ -1211,8 +1223,10 @@ class StressTest { std::string keystr2 = Key(rand_key); Slice k = keystr2; Status s = db_->Get(read_opts, column_family, k, &from_db); - VerifyValue(rand_column_family, rand_key, read_opts, - *(thread->shared), from_db, s, true); + if (VerifyValue(rand_column_family, rand_key, read_opts, + thread->shared, from_db, s, true) == false) { + break; + } } thread->shared->Put(rand_column_family, rand_key, value_base); if (FLAGS_use_merge) { @@ -1246,22 +1260,27 @@ class StressTest { void VerifyDb(ThreadState* thread) const { ReadOptions options(FLAGS_verify_checksum, true); - const SharedState& shared = *(thread->shared); - static const long max_key = shared.GetMaxKey(); - static const long keys_per_thread = max_key / shared.GetNumThreads(); + auto shared = thread->shared; + static const long max_key = shared->GetMaxKey(); + static const long keys_per_thread = max_key / shared->GetNumThreads(); long start = keys_per_thread * thread->tid; long end = start + keys_per_thread; - if (thread->tid == shared.GetNumThreads() - 1) { + if (thread->tid == shared->GetNumThreads() - 1) { end = max_key; } for (size_t cf = 0; cf < column_families_.size(); ++cf) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } if (!thread->rand.OneIn(2)) { // Use iterator to verify this range - options.prefix_seek = FLAGS_prefix_size > 0; unique_ptr iter( db_->NewIterator(options, column_families_[cf])); iter->Seek(Key(start)); for (long i = start; i < end; i++) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } // TODO(ljin): update "long" to uint64_t // Reseek when the prefix changes if (i % (static_cast(1) << 8 * (8 - FLAGS_prefix_size)) == @@ -1279,7 +1298,7 @@ class StressTest { from_db = iter->value().ToString(); iter->Next(); } else if (iter->key().compare(k) < 0) { - 
VerificationAbort("An out of range key was found", cf, i); + VerificationAbort(shared, "An out of range key was found", cf, i); } } else { // The iterator found no value for the key in question, so do not @@ -1294,6 +1313,9 @@ class StressTest { } else { // Use Get to verify this range for (long i = start; i < end; i++) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } std::string from_db; std::string keystr = Key(i); Slice k = keystr; @@ -1307,38 +1329,48 @@ class StressTest { } } - void VerificationAbort(std::string msg, int cf, long key) const { - fprintf(stderr, "Verification failed for column family %d key %ld: %s\n", - cf, key, msg.c_str()); - exit(1); + void VerificationAbort(SharedState* shared, std::string msg, int cf, + long key) const { + printf("Verification failed for column family %d key %ld: %s\n", cf, key, + msg.c_str()); + shared->SetVerificationFailure(); } - void VerifyValue(int cf, long key, const ReadOptions& opts, - const SharedState& shared, const std::string& value_from_db, + bool VerifyValue(int cf, long key, const ReadOptions& opts, + SharedState* shared, const std::string& value_from_db, Status s, bool strict = false) const { + if (shared->HasVerificationFailedYet()) { + return false; + } // compare value_from_db with the value in the shared state char value[100]; - uint32_t value_base = shared.Get(cf, key); + uint32_t value_base = shared->Get(cf, key); if (value_base == SharedState::SENTINEL && !strict) { - return; + return true; } if (s.ok()) { if (value_base == SharedState::SENTINEL) { - VerificationAbort("Unexpected value found", cf, key); + VerificationAbort(shared, "Unexpected value found", cf, key); + return false; } size_t sz = GenerateValue(value_base, value, sizeof(value)); if (value_from_db.length() != sz) { - VerificationAbort("Length of value read is not equal", cf, key); + VerificationAbort(shared, "Length of value read is not equal", cf, key); + return false; } if (memcmp(value_from_db.data(), value, sz) != 
0) { - VerificationAbort("Contents of value read don't match", cf, key); + VerificationAbort(shared, "Contents of value read don't match", cf, + key); + return false; } } else { if (value_base != SharedState::SENTINEL) { - VerificationAbort("Value not found", cf, key); + VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key); + return false; } } + return true; } static void PrintKeyValue(int cf, uint32_t key, const char* value, @@ -1693,6 +1725,9 @@ int main(int argc, char** argv) { } rocksdb::StressTest stress; - stress.Run(); - return 0; + if (stress.Run()) { + return 0; + } else { + return 1; + } } diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 9e77afa3e..64aa2d9e8 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -73,9 +73,6 @@ class HashLinkListRep : public MemTableRep { virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; - virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) - override; - virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; private: @@ -429,19 +426,14 @@ MemTableRep::Iterator* HashLinkListRep::GetIterator() { return new FullListIterator(list, new_arena); } -MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator( - const Slice& prefix) { - auto bucket = GetBucket(prefix); +MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) { + auto bucket = GetBucket(transform_->Transform(slice)); if (bucket == nullptr) { return new EmptyIterator(); } return new Iterator(this, bucket); } -MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) { - return GetPrefixIterator(transform_->Transform(slice)); -} - MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() { return new DynamicIterator(*this); } diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index e27ec5949..21df9f62b 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -42,9 +42,6 @@ class 
HashSkipListRep : public MemTableRep { virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; - virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) - override; - virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; private: @@ -307,18 +304,14 @@ MemTableRep::Iterator* HashSkipListRep::GetIterator() { return new Iterator(list, true, new_arena); } -MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) { - auto bucket = GetBucket(prefix); +MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) { + auto bucket = GetBucket(transform_->Transform(slice)); if (bucket == nullptr) { return new EmptyIterator(); } return new Iterator(bucket, false); } -MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) { - return GetPrefixIterator(transform_->Transform(slice)); -} - MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() { return new DynamicIterator(*this); } diff --git a/util/signal_test.cc b/util/signal_test.cc index d3446818d..f51fa548e 100644 --- a/util/signal_test.cc +++ b/util/signal_test.cc @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -#include "util/stack_trace.h" +#include "port/stack_trace.h" #include namespace { @@ -26,7 +26,7 @@ void f3() { } // namespace int main() { - rocksdb::InstallStackTraceHandler(); + rocksdb::port::InstallStackTraceHandler(); f3(); diff --git a/util/testharness.cc b/util/testharness.cc index 85716cdae..4208d2c46 100644 --- a/util/testharness.cc +++ b/util/testharness.cc @@ -8,11 +8,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "util/testharness.h" - #include #include #include #include +#include "port/stack_trace.h" namespace rocksdb { namespace test { @@ -39,6 +39,8 @@ bool RegisterTest(const char* base, const char* name, void (*func)()) { } int RunAllTests() { + port::InstallStackTraceHandler(); + const char* matcher = getenv("ROCKSDB_TESTS"); int num = 0; diff --git a/util/testharness.h b/util/testharness.h index f15917816..52c29848d 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -12,10 +12,10 @@ #include #include #include +#include "port/stack_trace.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "util/random.h" -#include "util/stack_trace.h" namespace rocksdb { namespace test { @@ -59,7 +59,7 @@ class Tester { ~Tester() { if (!ok_) { fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); - PrintStack(2); + port::PrintStack(2); exit(1); } } diff --git a/util/thread_local.cc b/util/thread_local.cc index 1b4220b8f..bc8a4c7d2 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -14,20 +14,14 @@ namespace rocksdb { -std::unique_ptr ThreadLocalPtr::StaticMeta::inst_; port::Mutex ThreadLocalPtr::StaticMeta::mutex_; #if !defined(OS_MACOSX) __thread ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr; #endif -ThreadLocalPtr::StaticMeta* ThreadLocalPtr::StaticMeta::Instance() { - if (UNLIKELY(inst_ == nullptr)) { - MutexLock l(&mutex_); - if (inst_ == nullptr) { - inst_.reset(new StaticMeta()); - } - } - return inst_.get(); +ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() { + static ThreadLocalPtr::StaticMeta inst; + return &inst; } void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { @@ -216,34 +210,34 @@ void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { } ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler) - : id_(StaticMeta::Instance()->GetId()) { + : id_(Instance()->GetId()) { if (handler != nullptr) { - StaticMeta::Instance()->SetHandler(id_, handler); + Instance()->SetHandler(id_, 
handler); } } ThreadLocalPtr::~ThreadLocalPtr() { - StaticMeta::Instance()->ReclaimId(id_); + Instance()->ReclaimId(id_); } void* ThreadLocalPtr::Get() const { - return StaticMeta::Instance()->Get(id_); + return Instance()->Get(id_); } void ThreadLocalPtr::Reset(void* ptr) { - StaticMeta::Instance()->Reset(id_, ptr); + Instance()->Reset(id_, ptr); } void* ThreadLocalPtr::Swap(void* ptr) { - return StaticMeta::Instance()->Swap(id_, ptr); + return Instance()->Swap(id_, ptr); } bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) { - return StaticMeta::Instance()->CompareAndSwap(id_, ptr, expected); + return Instance()->CompareAndSwap(id_, ptr, expected); } void ThreadLocalPtr::Scrape(autovector* ptrs, void* const replacement) { - StaticMeta::Instance()->Scrape(id_, ptrs, replacement); + Instance()->Scrape(id_, ptrs, replacement); } } // namespace rocksdb diff --git a/util/thread_local.h b/util/thread_local.h index a7728ed64..a037a9ceb 100644 --- a/util/thread_local.h +++ b/util/thread_local.h @@ -89,7 +89,7 @@ class ThreadLocalPtr { class StaticMeta { public: - static StaticMeta* Instance(); + StaticMeta(); // Return the next available Id uint32_t GetId(); @@ -117,8 +117,6 @@ class ThreadLocalPtr { void SetHandler(uint32_t id, UnrefHandler handler); private: - StaticMeta(); - // Get UnrefHandler for id with acquiring mutex // REQUIRES: mutex locked UnrefHandler GetHandler(uint32_t id); @@ -136,9 +134,6 @@ class ThreadLocalPtr { static ThreadData* GetThreadLocal(); - // Singleton instance - static std::unique_ptr inst_; - uint32_t next_instance_id_; // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed // frequently. This also prevents it from blowing up the vector space. 
@@ -163,6 +158,8 @@ class ThreadLocalPtr { pthread_key_t pthread_key_; }; + static StaticMeta* Instance(); + const uint32_t id_; }; diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index d273947a8..70dfa956e 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -49,7 +49,7 @@ struct Params { class IDChecker : public ThreadLocalPtr { public: - static uint32_t PeekId() { return StaticMeta::Instance()->PeekId(); } + static uint32_t PeekId() { return Instance()->PeekId(); } }; } // anonymous namespace diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 26ffcb456..ca1fb504a 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -87,7 +87,8 @@ void BackupableDBOptions::Dump(Logger* logger) const { // -------- BackupEngineImpl class --------- class BackupEngineImpl : public BackupEngine { public: - BackupEngineImpl(Env* db_env, const BackupableDBOptions& options); + BackupEngineImpl(Env* db_env, const BackupableDBOptions& options, + bool read_only = false); ~BackupEngineImpl(); Status CreateNewBackup(DB* db, bool flush_before_backup = false); Status PurgeOldBackups(uint32_t num_backups_to_keep); @@ -149,7 +150,7 @@ class BackupEngineImpl : public BackupEngine { Status AddFile(const FileInfo& file_info); - void Delete(); + void Delete(bool delete_meta = true); bool Empty() { return files_.empty(); @@ -258,6 +259,7 @@ class BackupEngineImpl : public BackupEngine { static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB size_t copy_file_buffer_size_; + bool read_only_; }; BackupEngine* BackupEngine::NewBackupEngine( @@ -266,27 +268,34 @@ BackupEngine* BackupEngine::NewBackupEngine( } BackupEngineImpl::BackupEngineImpl(Env* db_env, - const BackupableDBOptions& options) + const BackupableDBOptions& options, + bool read_only) : stop_backup_(false), options_(options), db_env_(db_env), backup_env_(options.backup_env != nullptr ? 
options.backup_env : db_env_), - copy_file_buffer_size_(kDefaultCopyFileBufferSize) { + copy_file_buffer_size_(kDefaultCopyFileBufferSize), + read_only_(read_only) { + if (read_only_) { + Log(options_.info_log, "Starting read_only backup engine"); + } options_.Dump(options_.info_log); - // create all the dirs we need - backup_env_->CreateDirIfMissing(GetAbsolutePath()); - backup_env_->NewDirectory(GetAbsolutePath(), &backup_directory_); - if (options_.share_table_files) { - backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); - backup_env_->NewDirectory(GetAbsolutePath(GetSharedFileRel()), - &shared_directory_); + if (!read_only_) { + // create all the dirs we need + backup_env_->CreateDirIfMissing(GetAbsolutePath()); + backup_env_->NewDirectory(GetAbsolutePath(), &backup_directory_); + if (options_.share_table_files) { + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + backup_env_->NewDirectory(GetAbsolutePath(GetSharedFileRel()), + &shared_directory_); + } + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); + backup_env_->NewDirectory(GetAbsolutePath(GetPrivateDirRel()), + &private_directory_); + backup_env_->CreateDirIfMissing(GetBackupMetaDir()); + backup_env_->NewDirectory(GetBackupMetaDir(), &meta_directory_); } - backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); - backup_env_->NewDirectory(GetAbsolutePath(GetPrivateDirRel()), - &private_directory_); - backup_env_->CreateDirIfMissing(GetBackupMetaDir()); - backup_env_->NewDirectory(GetBackupMetaDir(), &meta_directory_); std::vector backup_meta_files; backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files); @@ -295,8 +304,10 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, BackupID backup_id = 0; sscanf(file.c_str(), "%u", &backup_id); if (backup_id == 0 || file != std::to_string(backup_id)) { - // invalid file name, delete that - backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file); + if (!read_only_) { + // invalid 
file name, delete that + backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file); + } continue; } assert(backups_.find(backup_id) == backups_.end()); @@ -306,6 +317,7 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, } if (options_.destroy_old_data) { // Destory old data + assert(!read_only_); for (auto& backup : backups_) { backup.second.Delete(); obsolete_backups_.push_back(backup.first); @@ -319,9 +331,12 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, for (auto& backup : backups_) { Status s = backup.second.LoadFromFile(options_.backup_dir); if (!s.ok()) { - Log(options_.info_log, "Backup %u corrupted - deleting -- %s", - backup.first, s.ToString().c_str()); - backup.second.Delete(); + Log(options_.info_log, "Backup %u corrupted -- %s", backup.first, + s.ToString().c_str()); + if (!read_only_) { + Log(options_.info_log, "-> Deleting backup %u", backup.first); + } + backup.second.Delete(!read_only_); obsolete_backups_.push_back(backup.first); } } @@ -331,6 +346,7 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, } Status s = GetLatestBackupFileContents(&latest_backup_id_); + // If latest backup file is corrupted or non-existent // set latest backup as the biggest backup we have // or 0 if we have no backups @@ -349,16 +365,18 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, itr = backups_.erase(itr); } - PutLatestBackupFileContents(latest_backup_id_); // Ignore errors - GarbageCollection(true); - Log(options_.info_log, - "Initialized BackupEngine, the latest backup is %u.", + if (!read_only_) { + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(true); + } + Log(options_.info_log, "Initialized BackupEngine, the latest backup is %u.", latest_backup_id_); } BackupEngineImpl::~BackupEngineImpl() { LogFlush(options_.info_log); } Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { + assert(!read_only_); Status s; std::vector live_files; VectorLogPtr live_wal_files; @@ -499,6 +517,7 @@ Status 
BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { } Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { + assert(!read_only_); Log(options_.info_log, "Purging old backups, keeping %u", num_backups_to_keep); while (num_backups_to_keep < backups_.size()) { @@ -512,6 +531,7 @@ Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { } Status BackupEngineImpl::DeleteBackup(BackupID backup_id) { + assert(!read_only_); Log(options_.info_log, "Deleting backup %u", backup_id); auto backup = backups_.find(backup_id); if (backup == backups_.end()) { @@ -662,6 +682,7 @@ Status BackupEngineImpl::GetLatestBackupFileContents(uint32_t* latest_backup) { // do something like 1. delete file, 2. write new file // We write to a tmp file and then atomically rename Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) { + assert(!read_only_); Status s; unique_ptr file; EnvOptions env_options; @@ -871,6 +892,7 @@ void BackupEngineImpl::DeleteChildren(const std::string& dir, } void BackupEngineImpl::GarbageCollection(bool full_scan) { + assert(!read_only_); Log(options_.info_log, "Starting garbage collection"); std::vector to_delete; for (auto& itr : backuped_file_infos_) { @@ -973,7 +995,7 @@ Status BackupEngineImpl::BackupMeta::AddFile(const FileInfo& file_info) { return Status::OK(); } -void BackupEngineImpl::BackupMeta::Delete() { +void BackupEngineImpl::BackupMeta::Delete(bool delete_meta) { for (const auto& file : files_) { auto itr = file_infos_->find(file); assert(itr != file_infos_->end()); @@ -981,7 +1003,9 @@ void BackupEngineImpl::BackupMeta::Delete() { } files_.clear(); // delete meta file - env_->DeleteFile(meta_filename_); + if (delete_meta) { + env_->DeleteFile(meta_filename_); + } timestamp_ = 0; } @@ -1107,6 +1131,45 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { return s; } +// -------- BackupEngineReadOnlyImpl --------- +class BackupEngineReadOnlyImpl : public 
BackupEngineReadOnly { + public: + BackupEngineReadOnlyImpl(Env* db_env, const BackupableDBOptions& options) { + backup_engine_ = new BackupEngineImpl(db_env, options, true); + } + virtual ~BackupEngineReadOnlyImpl() {} + + virtual void GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); + } + + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) { + return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir, + restore_options); + } + + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) { + return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir, + restore_options); + } + + private: + BackupEngineImpl* backup_engine_; +}; + +BackupEngineReadOnly* BackupEngineReadOnly::NewReadOnlyBackupEngine( + Env* db_env, const BackupableDBOptions& options) { + if (options.destroy_old_data) { + assert(false); + return nullptr; + } + return new BackupEngineReadOnlyImpl(db_env, options); +} + // --- BackupableDB methods -------- BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index f6ffd9487..563800556 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -178,6 +178,12 @@ class TestEnv : public EnvWrapper { return EnvWrapper::NewWritableFile(f, r, options); } + virtual Status DeleteFile(const std::string& fname) override { + ASSERT_GT(limit_delete_files_, 0); + limit_delete_files_--; + return EnvWrapper::DeleteFile(fname); + } + void AssertWrittenFiles(std::vector& should_have_written) { sort(should_have_written.begin(), should_have_written.end()); sort(written_files_.begin(), written_files_.end()); @@ -192,6 +198,8 @@ class 
TestEnv : public EnvWrapper { limit_written_files_ = limit; } + void SetLimitDeleteFiles(uint64_t limit) { limit_delete_files_ = limit; } + void SetDummySequentialFile(bool dummy_sequential_file) { dummy_sequential_file_ = dummy_sequential_file; } @@ -200,7 +208,8 @@ class TestEnv : public EnvWrapper { bool dummy_sequential_file_ = false; std::vector written_files_; uint64_t limit_written_files_ = 1000000; -}; // TestEnv + uint64_t limit_delete_files_ = 1000000; +}; // TestEnv class FileManager : public EnvWrapper { public: @@ -864,7 +873,38 @@ TEST(BackupableDBTest, RateLimiting) { } } -} // anon namespace +TEST(BackupableDBTest, ReadOnlyBackupEngine) { + DestroyDB(dbname_, Options()); + OpenBackupableDB(true); + FillDB(db_.get(), 0, 100); + ASSERT_OK(db_->CreateNewBackup(true)); + FillDB(db_.get(), 100, 200); + ASSERT_OK(db_->CreateNewBackup(true)); + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + backupable_options_->destroy_old_data = false; + test_backup_env_->ClearWrittenFiles(); + test_backup_env_->SetLimitDeleteFiles(0); + auto read_only_backup_engine = + BackupEngineReadOnly::NewReadOnlyBackupEngine(env_, *backupable_options_); + std::vector backup_info; + read_only_backup_engine->GetBackupInfo(&backup_info); + ASSERT_EQ(backup_info.size(), 2U); + + RestoreOptions restore_options(false); + ASSERT_OK(read_only_backup_engine->RestoreDBFromLatestBackup( + dbname_, dbname_, restore_options)); + delete read_only_backup_engine; + std::vector should_have_written; + test_backup_env_->AssertWrittenFiles(should_have_written); + + DB* db = OpenDB(); + AssertExists(db, 0, 200); + delete db; +} + +} // anon namespace } // namespace rocksdb