diff --git a/CMakeLists.txt b/CMakeLists.txt
index d87ff931b..63d93c29f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -293,6 +293,10 @@ set(SOURCES
         db/dbformat.cc
         db/db_filesnapshot.cc
         db/db_impl.cc
+        db/db_impl_write.cc
+        db/db_impl_compaction_flush.cc
+        db/db_impl_files.cc
+        db/db_impl_open.cc
         db/db_impl_debug.cc
         db/db_impl_experimental.cc
         db/db_impl_readonly.cc
diff --git a/db/db_impl.cc b/db/db_impl.cc
index 8c9aa5f12..3843a4d80 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -69,7 +69,6 @@
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
 #include "rocksdb/version.h"
-#include "rocksdb/wal_filter.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/block.h"
 #include "table/block_based_table_factory.h"
@@ -101,169 +100,9 @@
 #include "util/thread_status_util.h"
 
 namespace rocksdb {
-
 const std::string kDefaultColumnFamilyName("default");
-
 void DumpRocksDBBuildVersion(Logger * log);
 
-Options SanitizeOptions(const std::string& dbname,
-                        const Options& src) {
-  auto db_options = SanitizeOptions(dbname, DBOptions(src));
-  ImmutableDBOptions immutable_db_options(db_options);
-  auto cf_options =
-      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
-  return Options(db_options, cf_options);
-}
-
-DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
-  DBOptions result(src);
-
-  // A max_open_files value of -1 means an "infinite" number of open files.
-  if (result.max_open_files != -1) {
-    int max_max_open_files = port::GetMaxOpenFiles();
-    if (max_max_open_files == -1) {
-      max_max_open_files = 1000000;
-    }
-    ClipToRange(&result.max_open_files, 20, max_max_open_files);
-  }
-
-  if (result.info_log == nullptr) {
-    Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
-    if (!s.ok()) {
-      // No place suitable for logging
-      result.info_log = nullptr;
-    }
-  }
-
-  if (!result.write_buffer_manager) {
-    result.write_buffer_manager.reset(
-        new WriteBufferManager(result.db_write_buffer_size));
-  }
-  if (result.base_background_compactions == -1) {
-    result.base_background_compactions = result.max_background_compactions;
-  }
-  if (result.base_background_compactions > result.max_background_compactions) {
-    result.base_background_compactions = result.max_background_compactions;
-  }
-  result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions,
-                                           Env::Priority::LOW);
-  result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes,
-                                           Env::Priority::HIGH);
-
-  if (result.rate_limiter.get() != nullptr) {
-    if (result.bytes_per_sync == 0) {
-      result.bytes_per_sync = 1024 * 1024;
-    }
-  }
-
-  if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
-    result.recycle_log_file_num = false;
-  }
-
-  if (result.recycle_log_file_num &&
-      (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
-       result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
-    // kPointInTimeRecovery is indistinguishable from
-    // kTolerateCorruptedTailRecords in recycle mode since we define
-    // the "end" of the log as the first corrupt record we encounter.
-    // kAbsoluteConsistency doesn't make sense because even a clean
-    // shutdown leaves old junk at the end of the log file.
-    result.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
-  }
-
-  if (result.wal_dir.empty()) {
-    // Use dbname as default
-    result.wal_dir = dbname;
-  }
-  if (result.wal_dir.back() == '/') {
-    result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
-  }
-
-  if (result.db_paths.size() == 0) {
-    result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
-  }
-
-  if (result.use_direct_reads && result.compaction_readahead_size == 0) {
-    result.compaction_readahead_size = 1024 * 1024 * 2;
-  }
-
-  if (result.compaction_readahead_size > 0) {
-    result.new_table_reader_for_compaction_inputs = true;
-  }
-
-  // Force flush on DB open if 2PC is enabled, since with 2PC we have no
-  // guarantee that consecutive log files have consecutive sequence ids, which
-  // makes recovery complicated.
-  if (result.allow_2pc) {
-    result.avoid_flush_during_recovery = false;
-  }
-
-  return result;
-}
-
-namespace {
-
-Status SanitizeOptionsByTable(
-    const DBOptions& db_opts,
-    const std::vector<ColumnFamilyDescriptor>& column_families) {
-  Status s;
-  for (auto cf : column_families) {
-    s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
-    if (!s.ok()) {
-      return s;
-    }
-  }
-  return Status::OK();
-}
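Reviewer note: the sanitize pass deleted above (moved to db_impl_open.cc) silently clamps and rewrites interdependent options. A minimal standalone sketch of just the max_open_files clamp, under the assumption that `ClampMaxOpenFiles` is a hypothetical helper restating the rule, not the RocksDB API:

```cpp
#include <algorithm>
#include <cassert>

// Hypothetical restatement of the max_open_files clamp in SanitizeOptions():
// -1 passes through as "infinite"; otherwise clip to [20, system limit],
// where an unknown system limit (-1) defaults to 1000000.
static int ClampMaxOpenFiles(int requested, int system_limit) {
  if (requested == -1) return -1;
  if (system_limit == -1) system_limit = 1000000;
  return std::min(std::max(requested, 20), system_limit);
}

int main() {
  assert(ClampMaxOpenFiles(-1, 4096) == -1);         // unlimited passes through
  assert(ClampMaxOpenFiles(5, 4096) == 20);          // floor of 20
  assert(ClampMaxOpenFiles(1 << 20, 4096) == 4096);  // capped by the OS limit
  return 0;
}
```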
"); - } - - if (db_options.keep_log_file_num == 0) { - return Status::InvalidArgument("keep_log_file_num must be greater than 0"); - } - - return Status::OK(); -} - CompressionType GetCompressionFlush( const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options) { @@ -284,6 +123,7 @@ CompressionType GetCompressionFlush( } } +namespace { void DumpSupportInfo(Logger* logger) { ROCKS_LOG_HEADER(logger, "Compression algorithms supported:"); ROCKS_LOG_HEADER(logger, "\tSnappy supported: %d", Snappy_Supported()); @@ -294,8 +134,7 @@ void DumpSupportInfo(Logger* logger) { ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %d", crc32c::IsFastCrc32Supported()); } - -} // namespace +} // namespace DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), @@ -505,44 +344,6 @@ DBImpl::~DBImpl() { LogFlush(immutable_db_options_.info_log); } -Status DBImpl::NewDB() { - VersionEdit new_db; - new_db.SetLogNumber(0); - new_db.SetNextFile(2); - new_db.SetLastSequence(0); - - Status s; - - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); - const std::string manifest = DescriptorFileName(dbname_, 1); - { - unique_ptr file; - EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); - s = NewWritableFile(env_, manifest, &file, env_options); - if (!s.ok()) { - return s; - } - file->SetPreallocationBlockSize( - immutable_db_options_.manifest_preallocation_size); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); - log::Writer log(std::move(file_writer), 0, false); - std::string record; - new_db.EncodeTo(&record); - s = log.AddRecord(record); - if (s.ok()) { - s = SyncManifest(env_, &immutable_db_options_, log.file()); - } - } - if (s.ok()) { - // Make "CURRENT" file that points to the new manifest file. 
-    s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir());
-  } else {
-    env_->DeleteFile(manifest);
-  }
-  return s;
-}
-
 void DBImpl::MaybeIgnoreError(Status* s) const {
   if (s->ok() || immutable_db_options_.paranoid_checks) {
     // No change needed
@@ -656,88 +457,6 @@ void DBImpl::MaybeDumpStats() {
   }
 }
 
-uint64_t DBImpl::FindMinPrepLogReferencedByMemTable() {
-  if (!allow_2pc()) {
-    return 0;
-  }
-
-  uint64_t min_log = 0;
-
-  // we must look through the memtables for two phase transactions
-  // that have been committed but not yet flushed
-  for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
-    if (loop_cfd->IsDropped()) {
-      continue;
-    }
-
-    auto log = loop_cfd->imm()->GetMinLogContainingPrepSection();
-
-    if (log > 0 && (min_log == 0 || log < min_log)) {
-      min_log = log;
-    }
-
-    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
-
-    if (log > 0 && (min_log == 0 || log < min_log)) {
-      min_log = log;
-    }
-  }
-
-  return min_log;
-}
-
-void DBImpl::MarkLogAsHavingPrepSectionFlushed(uint64_t log) {
-  assert(log != 0);
-  std::lock_guard<std::mutex> lock(prep_heap_mutex_);
-  auto it = prepared_section_completed_.find(log);
-  assert(it != prepared_section_completed_.end());
-  it->second += 1;
-}
-
-void DBImpl::MarkLogAsContainingPrepSection(uint64_t log) {
-  assert(log != 0);
-  std::lock_guard<std::mutex> lock(prep_heap_mutex_);
-  min_log_with_prep_.push(log);
-  auto it = prepared_section_completed_.find(log);
-  if (it == prepared_section_completed_.end()) {
-    prepared_section_completed_[log] = 0;
-  }
-}
-
-uint64_t DBImpl::FindMinLogContainingOutstandingPrep() {
-
-  if (!allow_2pc()) {
-    return 0;
-  }
-
-  std::lock_guard<std::mutex> lock(prep_heap_mutex_);
-  uint64_t min_log = 0;
-
-  // first we look in the prepared heap where we keep
-  // track of transactions that have been prepared (written to WAL)
-  // but not yet committed.
-  while (!min_log_with_prep_.empty()) {
-    min_log = min_log_with_prep_.top();
-
-    auto it = prepared_section_completed_.find(min_log);
-
-    // value was marked as 'deleted' from heap
-    if (it != prepared_section_completed_.end() && it->second > 0) {
-      it->second -= 1;
-      min_log_with_prep_.pop();
-
-      // back to square one...
-      min_log = 0;
-      continue;
-    } else {
-      // found a valid value
-      break;
-    }
-  }
-
-  return min_log;
-}
-
 void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
   if (!job_context->logs_to_free.empty()) {
     for (auto l : job_context->logs_to_free) {
@@ -748,4607 +467,1058 @@ void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
   }
 }
 
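Reviewer note: the deleted 2PC helpers above implement lazy deletion on a min-heap — entries are never erased in place; a side map of "flushed" counts marks heap entries stale until the pop loop skips them. A standalone sketch of that pattern (the `MinLogTracker` type is hypothetical, not the RocksDB API):

```cpp
#include <cassert>
#include <cstdint>
#include <functional>
#include <queue>
#include <unordered_map>
#include <vector>

// Lazy-deletion min-heap, mirroring min_log_with_prep_ /
// prepared_section_completed_ above (hypothetical standalone type).
class MinLogTracker {
 public:
  void MarkPrepared(uint64_t log) { heap_.push(log); }
  // One call per flushed prepared section; marks one heap entry stale.
  void MarkFlushed(uint64_t log) { ++completed_[log]; }
  // Returns 0 when no outstanding prepared sections remain.
  uint64_t MinOutstanding() {
    while (!heap_.empty()) {
      uint64_t top = heap_.top();
      auto it = completed_.find(top);
      if (it != completed_.end() && it->second > 0) {
        --it->second;  // consume one stale marker
        heap_.pop();   // and discard the matching heap entry
        continue;
      }
      return top;      // smallest still-outstanding log
    }
    return 0;
  }

 private:
  std::priority_queue<uint64_t, std::vector<uint64_t>,
                      std::greater<uint64_t>> heap_;
  std::unordered_map<uint64_t, uint64_t> completed_;
};

int main() {
  MinLogTracker t;
  t.MarkPrepared(7);
  t.MarkPrepared(9);
  assert(t.MinOutstanding() == 7);
  t.MarkFlushed(7);                 // log 7's prep section flushed
  assert(t.MinOutstanding() == 9);  // heap lazily skips the stale 7
  return 0;
}
```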
-uint64_t DBImpl::MinLogNumberToKeep() {
-  uint64_t log_number = versions_->MinLogNumber();
-
-  if (allow_2pc()) {
-    // if we are using 2pc we must consider logs containing prepared
-    // sections of outstanding transactions.
-    //
-    // We must check min logs with outstanding prep before we check
-    // logs referenced by memtables because a log referenced by the
-    // first data structure could transition to the second under us.
-    //
-    // TODO(horuff): iterating over all column families under db mutex;
-    // should find a more optimal solution
-    auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep();
-
-    if (min_log_in_prep_heap != 0 && min_log_in_prep_heap < log_number) {
-      log_number = min_log_in_prep_heap;
-    }
-
-    auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable();
-
-    if (min_log_refed_by_mem != 0 && min_log_refed_by_mem < log_number) {
-      log_number = min_log_refed_by_mem;
-    }
+Directory* DBImpl::Directories::GetDataDir(size_t path_id) {
+  assert(path_id < data_dirs_.size());
+  Directory* ret_dir = data_dirs_[path_id].get();
+  if (ret_dir == nullptr) {
+    // Should use db_dir_
+    return db_dir_.get();
   }
-  return log_number;
+  return ret_dir;
 }
 
-// * Returns the list of live files in 'sst_live'
-// If it's doing full scan:
-// * Returns the list of all files in the filesystem in
-//   'full_scan_candidate_files'.
-// Otherwise, gets obsolete files from VersionSet.
-// no_full_scan = true -- never do the full scan using GetChildren()
-// force = false -- don't force the full scan, except every
-//  mutable_db_options_.delete_obsolete_files_period_micros
-// force = true -- force the full scan
-void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
-                               bool no_full_scan) {
-  mutex_.AssertHeld();
-
-  // if deletion is disabled, do nothing
-  if (disable_delete_obsolete_files_ > 0) {
-    return;
+Status DBImpl::SetOptions(ColumnFamilyHandle* column_family,
+    const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  if (options_map.empty()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "SetOptions() on column family [%s], empty input",
+                   cfd->GetName().c_str());
+    return Status::InvalidArgument("empty input");
   }
-  bool doing_the_full_scan = false;
+  MutableCFOptions new_options;
+  Status s;
+  Status persist_options_status;
+  WriteThread::Writer w;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    s = cfd->SetOptions(options_map);
+    if (s.ok()) {
+      new_options = *cfd->GetLatestMutableCFOptions();
+      // Append new version to recompute compaction score.
+      VersionEdit dummy_edit;
+      versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+                             directories_.GetDbDir());
+      // Trigger possible flush/compactions. This has to be before we persist
+      // options to file, otherwise there will be a deadlock with writer
+      // thread.
+      auto* old_sv =
+          InstallSuperVersionAndScheduleWork(cfd, nullptr, new_options);
+      delete old_sv;
-  // logic for figuring out if we're doing the full scan
-  if (no_full_scan) {
-    doing_the_full_scan = false;
-  } else if (force ||
-             mutable_db_options_.delete_obsolete_files_period_micros == 0) {
-    doing_the_full_scan = true;
-  } else {
-    const uint64_t now_micros = env_->NowMicros();
-    if ((delete_obsolete_files_last_run_ +
-         mutable_db_options_.delete_obsolete_files_period_micros) <
-        now_micros) {
-      doing_the_full_scan = true;
-      delete_obsolete_files_last_run_ = now_micros;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      persist_options_status = WriteOptionsFile();
+      write_thread_.ExitUnbatched(&w);
     }
   }
-  // don't delete files that might be currently written to from compaction
-  // threads
-  // Since job_context->min_pending_output is set, until file scan finishes,
-  // mutex_ cannot be released. Otherwise, we might see no min_pending_output
-  // here but later find newer generated unfinalized files while scanning.
-  if (!pending_outputs_.empty()) {
-    job_context->min_pending_output = *pending_outputs_.begin();
-  } else {
-    // delete all of them
-    job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "SetOptions() on column family [%s], inputs:",
+                 cfd->GetName().c_str());
+  for (const auto& o : options_map) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+                   o.second.c_str());
   }
-
-  // Get obsolete files. This function will also update the list of
-  // pending files in VersionSet().
-  versions_->GetObsoleteFiles(&job_context->sst_delete_files,
-                              &job_context->manifest_delete_files,
-                              job_context->min_pending_output);
-
-  // store the current filenum, lognum, etc
-  job_context->manifest_file_number = versions_->manifest_file_number();
-  job_context->pending_manifest_file_number =
-      versions_->pending_manifest_file_number();
-  job_context->log_number = MinLogNumberToKeep();
-
-  job_context->prev_log_number = versions_->prev_log_number();
-
-  versions_->AddLiveFiles(&job_context->sst_live);
-  if (doing_the_full_scan) {
-    for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
-         path_id++) {
-      // set of all files in the directory. We'll exclude files that are still
-      // alive in the subsequent processings.
-      std::vector<std::string> files;
-      env_->GetChildren(immutable_db_options_.db_paths[path_id].path,
-                        &files);  // Ignore errors
-      for (std::string file : files) {
-        // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
-        job_context->full_scan_candidate_files.emplace_back(
-            "/" + file, static_cast<uint32_t>(path_id));
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+    new_options.Dump(immutable_db_options_.info_log.get());
+    if (!persist_options_status.ok()) {
+      if (immutable_db_options_.fail_if_options_file_error) {
+        s = Status::IOError(
+            "SetOptions() succeeded, but unable to persist options",
+            persist_options_status.ToString());
       }
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Unable to persist options in SetOptions() -- %s",
+                     persist_options_status.ToString().c_str());
     }
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+                   cfd->GetName().c_str());
+  }
+  LogFlush(immutable_db_options_.info_log);
+  return s;
+#endif  // ROCKSDB_LITE
+}
 
-    // Add log files in wal_dir
-    if (immutable_db_options_.wal_dir != dbname_) {
-      std::vector<std::string> log_files;
-      env_->GetChildren(immutable_db_options_.wal_dir,
-                        &log_files);  // Ignore errors
-      for (std::string log_file : log_files) {
-        job_context->full_scan_candidate_files.emplace_back(log_file, 0);
-      }
-    }
-    // Add info log files in db_log_dir
-    if (!immutable_db_options_.db_log_dir.empty() &&
-        immutable_db_options_.db_log_dir != dbname_) {
-      std::vector<std::string> info_log_files;
-      // Ignore errors
-      env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files);
-      for (std::string log_file : info_log_files) {
-        job_context->full_scan_candidate_files.emplace_back(log_file, 0);
-      }
-    }
+Status DBImpl::SetDBOptions(
+    const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  if (options_map.empty()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "SetDBOptions(), empty input.");
+    return Status::InvalidArgument("empty input");
   }
-  // logs_ is empty when called during recovery, in which case there can't yet
-  // be any tracked obsolete logs
-  if (!alive_log_files_.empty() && !logs_.empty()) {
-    uint64_t min_log_number = job_context->log_number;
-    size_t num_alive_log_files = alive_log_files_.size();
-    // find newly obsoleted log files
-    while (alive_log_files_.begin()->number < min_log_number) {
-      auto& earliest = *alive_log_files_.begin();
-      if (immutable_db_options_.recycle_log_file_num >
-          log_recycle_files.size()) {
-        ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                       "adding log %" PRIu64 " to recycle list\n",
-                       earliest.number);
-        log_recycle_files.push_back(earliest.number);
-      } else {
-        job_context->log_delete_files.push_back(earliest.number);
-      }
-      if (job_context->size_log_to_delete == 0) {
-        job_context->prev_total_log_size = total_log_size_;
-        job_context->num_alive_log_files = num_alive_log_files;
+  MutableDBOptions new_options;
+  Status s;
+  Status persist_options_status;
+  WriteThread::Writer w;
+  WriteContext write_context;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+                                       &new_options);
+    if (s.ok()) {
+      if (new_options.max_background_compactions >
+          mutable_db_options_.max_background_compactions) {
+        env_->IncBackgroundThreadsIfNeeded(
+            new_options.max_background_compactions, Env::Priority::LOW);
+        MaybeScheduleFlushOrCompaction();
       }
-      job_context->size_log_to_delete += earliest.size;
-      total_log_size_ -= earliest.size;
-      alive_log_files_.pop_front();
-      // Current log should always stay alive since it can't have
-      // number < MinLogNumber().
-      assert(alive_log_files_.size());
-    }
-    while (!logs_.empty() && logs_.front().number < min_log_number) {
-      auto& log = logs_.front();
-      if (log.getting_synced) {
-        log_sync_cv_.Wait();
-        // logs_ could have changed while we were waiting.
-        continue;
+
+      write_controller_.set_max_delayed_write_rate(
+          new_options.delayed_write_rate);
+
+      mutable_db_options_ = new_options;
+
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (total_log_size_ > GetMaxTotalWalSize()) {
+        Status purge_wal_status = HandleWALFull(&write_context);
+        if (!purge_wal_status.ok()) {
+          ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                         "Unable to purge WAL files in SetDBOptions() -- %s",
+                         purge_wal_status.ToString().c_str());
+        }
       }
-      logs_to_free_.push_back(log.ReleaseWriter());
-      logs_.pop_front();
+      persist_options_status = WriteOptionsFile();
+      write_thread_.ExitUnbatched(&w);
     }
-    // Current log cannot be obsolete.
-    assert(!logs_.empty());
   }
-
-  // We're just cleaning up for DB::Write().
-  assert(job_context->logs_to_free.empty());
-  job_context->logs_to_free = logs_to_free_;
-  job_context->log_recycle_files.assign(log_recycle_files.begin(),
-                                        log_recycle_files.end());
-  logs_to_free_.clear();
-}
-
-namespace {
-bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
-                          const JobContext::CandidateFileInfo& second) {
-  if (first.file_name > second.file_name) {
-    return true;
-  } else if (first.file_name < second.file_name) {
-    return false;
-  } else {
-    return (first.path_id > second.path_id);
-  }
-}
-}  // namespace
-
-// Delete obsolete files and log status and information of file deletion
-void DBImpl::DeleteObsoleteFileImpl(Status file_deletion_status, int job_id,
-                                    const std::string& fname, FileType type,
-                                    uint64_t number, uint32_t path_id) {
-  if (type == kTableFile) {
-    file_deletion_status =
-        DeleteSSTFile(&immutable_db_options_, fname, path_id);
-  } else {
-    file_deletion_status = env_->DeleteFile(fname);
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+  for (const auto& o : options_map) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+                   o.second.c_str());
   }
-  if (file_deletion_status.ok()) {
-    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
-                    "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
-                    fname.c_str(), type, number,
-                    file_deletion_status.ToString().c_str());
-  } else if (env_->FileExists(fname).IsNotFound()) {
-    ROCKS_LOG_INFO(
-        immutable_db_options_.info_log,
-        "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
-        " -- %s\n",
-        job_id, fname.c_str(), type, number,
-        file_deletion_status.ToString().c_str());
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+    new_options.Dump(immutable_db_options_.info_log.get());
+    if (!persist_options_status.ok()) {
+      if (immutable_db_options_.fail_if_options_file_error) {
+        s = Status::IOError(
+            "SetDBOptions() succeeded, but unable to persist options",
+            persist_options_status.ToString());
+      }
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Unable to persist options in SetDBOptions() -- %s",
+                     persist_options_status.ToString().c_str());
+    }
   } else {
-    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
-                    "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
-                    job_id, fname.c_str(), type, number,
-                    file_deletion_status.ToString().c_str());
-  }
-  if (type == kTableFile) {
-    EventHelpers::LogAndNotifyTableFileDeletion(
-        &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
-        immutable_db_options_.listeners);
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
   }
+  LogFlush(immutable_db_options_.info_log);
+  return s;
+#endif  // ROCKSDB_LITE
 }
 
-// Diffs the files listed in filenames against the live files; those that do
-// not belong to live files may be removed. Also, removes all the
-// files in sst_delete_files and log_delete_files.
-// It is not necessary to hold the mutex when invoking this method.
-void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) {
-  // we'd better have something to delete
-  assert(state.HaveSomethingToDelete());
-
-  // this checks if FindObsoleteFiles() was run before. If not, don't do
-  // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also
-  // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
-  if (state.manifest_file_number == 0) {
-    return;
-  }
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+    const MutableCFOptions& mutable_cf_options, int level) {
+  mutex_.AssertHeld();
+  const auto* vstorage = cfd->current()->storage_info();
+  int minimum_level = level;
+  for (int i = level - 1; i > 0; --i) {
+    // stop if level i is not empty
+    if (vstorage->NumLevelFiles(i) > 0) break;
+    // stop if level i is too small (cannot fit the level files)
+    if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+      break;
+    }
+    minimum_level = i;
+  }
+  return minimum_level;
+}
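Reviewer note: the deleted `CompareCandidateFile` above (moved to db_impl_files.cc) exists only to support the sort-then-`std::unique` dedup of candidate files. A self-contained sketch of that idiom, using a hypothetical `CandidateFile` stand-in for `JobContext::CandidateFileInfo`:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Stand-in for JobContext::CandidateFileInfo (hypothetical, simplified).
struct CandidateFile {
  std::string file_name;
  uint32_t path_id;
  bool operator==(const CandidateFile& o) const {
    return file_name == o.file_name && path_id == o.path_id;
  }
};

int main() {
  std::vector<CandidateFile> files = {
      {"000007.sst", 0}, {"000005.log", 0}, {"000007.sst", 0}};
  // Impose a strict total order (descending, mirroring CompareCandidateFile),
  // then let std::unique drop the now-adjacent duplicates.
  std::sort(files.begin(), files.end(),
            [](const CandidateFile& a, const CandidateFile& b) {
              return a.file_name != b.file_name ? a.file_name > b.file_name
                                                : a.path_id > b.path_id;
            });
  files.erase(std::unique(files.begin(), files.end()), files.end());
  assert(files.size() == 2);  // the duplicate 000007.sst is gone
  return 0;
}
```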
+Status DBImpl::SyncWAL() {
+  autovector<log::Writer*, 1> logs_to_sync;
+  bool need_log_dir_sync;
+  uint64_t current_log_number;
 
-  // Now, convert live list to an unordered map, WITHOUT mutex held;
-  // set is slow.
-  std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
-  for (const FileDescriptor& fd : state.sst_live) {
-    sst_live_map[fd.GetNumber()] = &fd;
-  }
-  std::unordered_set<uint64_t> log_recycle_files_set(
-      state.log_recycle_files.begin(), state.log_recycle_files.end());
-
-  auto candidate_files = state.full_scan_candidate_files;
-  candidate_files.reserve(
-      candidate_files.size() + state.sst_delete_files.size() +
-      state.log_delete_files.size() + state.manifest_delete_files.size());
-  // We may ignore the dbname when generating the file names.
-  const char* kDumbDbName = "";
-  for (auto file : state.sst_delete_files) {
-    candidate_files.emplace_back(
-        MakeTableFileName(kDumbDbName, file->fd.GetNumber()),
-        file->fd.GetPathId());
-    delete file;
-  }
-
-  for (auto file_num : state.log_delete_files) {
-    if (file_num > 0) {
-      candidate_files.emplace_back(LogFileName(kDumbDbName, file_num), 0);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    assert(!logs_.empty());
+
+    // This SyncWAL() call only cares about logs up to this number.
+    current_log_number = logfile_number_;
+
+    while (logs_.front().number <= current_log_number &&
+           logs_.front().getting_synced) {
+      log_sync_cv_.Wait();
     }
+    // First check that logs are safe to sync in background.
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+        return Status::NotSupported(
+            "SyncWAL() is not supported for this implementation of WAL file",
+            immutable_db_options_.allow_mmap_writes
+                ? "try setting Options::allow_mmap_writes to false"
+                : Slice());
+      }
+    }
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      auto& log = *it;
+      assert(!log.getting_synced);
+      log.getting_synced = true;
+      logs_to_sync.push_back(log.writer);
+    }
+
+    need_log_dir_sync = !log_dir_synced_;
   }
-  for (const auto& filename : state.manifest_delete_files) {
-    candidate_files.emplace_back(filename, 0);
-  }
-
-  // dedup state.candidate_files so we don't try to delete the same
-  // file twice
-  std::sort(candidate_files.begin(), candidate_files.end(),
-            CompareCandidateFile);
-  candidate_files.erase(
-      std::unique(candidate_files.begin(), candidate_files.end()),
-      candidate_files.end());
 
-  if (state.prev_total_log_size > 0) {
-    ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                   "[JOB %d] Try to delete WAL files size %" PRIu64
-                   ", prev total WAL file size %" PRIu64
-                   ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
-                   state.job_id, state.size_log_to_delete,
-                   state.prev_total_log_size, state.num_alive_log_files);
+  RecordTick(stats_, WAL_FILE_SYNCED);
+  Status status;
+  for (log::Writer* log : logs_to_sync) {
+    status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+    if (!status.ok()) {
+      break;
+    }
+  }
+  if (status.ok() && need_log_dir_sync) {
+    status = directories_.GetWalDir()->Fsync();
   }
 
-  std::vector<std::string> old_info_log_files;
-  InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
-                                dbname_);
-  for (const auto& candidate_file : candidate_files) {
-    std::string to_delete = candidate_file.file_name;
-    uint32_t path_id = candidate_file.path_id;
-    uint64_t number;
-    FileType type;
-    // Ignore file if we cannot recognize it.
-    if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
-      continue;
-    }
-
-    bool keep = true;
-    switch (type) {
-      case kLogFile:
-        keep = ((number >= state.log_number) ||
-                (number == state.prev_log_number) ||
-                (log_recycle_files_set.find(number) !=
-                 log_recycle_files_set.end()));
-        break;
-      case kDescriptorFile:
-        // Keep my manifest file, and any newer incarnations'
-        // (can happen during manifest roll)
-        keep = (number >= state.manifest_file_number);
-        break;
-      case kTableFile:
-        // If the second condition is not there, this makes
-        // DontDeletePendingOutputs fail
-        keep = (sst_live_map.find(number) != sst_live_map.end()) ||
-               number >= state.min_pending_output;
-        break;
-      case kTempFile:
-        // Any temp files that are currently being written to must
-        // be recorded in pending_outputs_, which is inserted into "live".
-        // Also, SetCurrentFile creates a temp file when writing out new
-        // manifest, which is equal to state.pending_manifest_file_number. We
-        // should not delete that file
-        //
-        // TODO(yhchiang): carefully modify the third condition to safely
-        //                 remove the temp options files.
-        keep = (sst_live_map.find(number) != sst_live_map.end()) ||
-               (number == state.pending_manifest_file_number) ||
-               (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
-        break;
-      case kInfoLogFile:
-        keep = true;
-        if (number != 0) {
-          old_info_log_files.push_back(to_delete);
-        }
-        break;
-      case kCurrentFile:
-      case kDBLockFile:
-      case kIdentityFile:
-      case kMetaDatabase:
-      case kOptionsFile:
-        keep = true;
-        break;
-    }
-
-    if (keep) {
-      continue;
-    }
-
-    std::string fname;
-    if (type == kTableFile) {
-      // evict from cache
-      TableCache::Evict(table_cache_.get(), number);
-      fname = TableFileName(immutable_db_options_.db_paths, number, path_id);
-    } else {
-      fname = ((type == kLogFile) ? immutable_db_options_.wal_dir : dbname_) +
-              "/" + to_delete;
-    }
-
-#ifndef ROCKSDB_LITE
-    if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 ||
-                             immutable_db_options_.wal_size_limit_mb > 0)) {
-      wal_manager_.ArchiveWALFile(fname, number);
-      continue;
-    }
-#endif  // !ROCKSDB_LITE
-
-    Status file_deletion_status;
-    if (schedule_only) {
-      InstrumentedMutexLock guard_lock(&mutex_);
-      SchedulePendingPurge(fname, type, number, path_id, state.job_id);
-    } else {
-      DeleteObsoleteFileImpl(file_deletion_status, state.job_id, fname, type,
-                             number, path_id);
-    }
-  }
-
-  // Delete old info log files.
-  size_t old_info_log_file_count = old_info_log_files.size();
-  if (old_info_log_file_count != 0 &&
-      old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
-    std::sort(old_info_log_files.begin(), old_info_log_files.end());
-    size_t end =
-        old_info_log_file_count - immutable_db_options_.keep_log_file_num;
-    for (unsigned int i = 0; i <= end; i++) {
-      std::string& to_delete = old_info_log_files.at(i);
-      std::string full_path_to_delete =
-          (immutable_db_options_.db_log_dir.empty()
-               ?
-               dbname_
-               : immutable_db_options_.db_log_dir) +
-          "/" + to_delete;
-      ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                     "[JOB %d] Delete info log file %s\n", state.job_id,
-                     full_path_to_delete.c_str());
-      Status s = env_->DeleteFile(full_path_to_delete);
-      if (!s.ok()) {
-        if (env_->FileExists(full_path_to_delete).IsNotFound()) {
-          ROCKS_LOG_INFO(
-              immutable_db_options_.info_log,
-              "[JOB %d] Tried to delete non-existing info log file %s FAILED "
-              "-- %s\n",
-              state.job_id, to_delete.c_str(), s.ToString().c_str());
-        } else {
-          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
-                          "[JOB %d] Delete info log file %s FAILED -- %s\n",
-                          state.job_id, to_delete.c_str(),
-                          s.ToString().c_str());
-        }
-      }
-    }
-  }
-#ifndef ROCKSDB_LITE
-  wal_manager_.PurgeObsoleteWALFiles();
-#endif  // ROCKSDB_LITE
-  LogFlush(immutable_db_options_.info_log);
-}
-
-void DBImpl::DeleteObsoleteFiles() {
-  mutex_.AssertHeld();
-  JobContext job_context(next_job_id_.fetch_add(1));
-  FindObsoleteFiles(&job_context, true);
-
-  mutex_.Unlock();
-  if (job_context.HaveSomethingToDelete()) {
-    PurgeObsoleteFiles(job_context);
-  }
-  job_context.Clean();
-  mutex_.Lock();
-}
-
-Status DBImpl::Directories::CreateAndNewDirectory(
-    Env* env, const std::string& dirname,
-    std::unique_ptr<Directory>* directory) const {
-  // We call CreateDirIfMissing() as the directory may already exist (if we
-  // are reopening a DB), when this happens we don't want creating the
-  // directory to cause an error. However, we need to check if creating the
-  // directory fails or else we may get an obscure message about the lock
-  // file not existing. One real-world example of this occurring is if
-  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
-  // when dbname_ is "dir/db" but "dir" doesn't exist.
-  Status s = env->CreateDirIfMissing(dirname);
-  if (!s.ok()) {
-    return s;
+  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    MarkLogsSynced(current_log_number, need_log_dir_sync, status);
   }
-  return env->NewDirectory(dirname, directory);
+  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+  return status;
 }
 
-Status DBImpl::Directories::SetDirectories(
-    Env* env, const std::string& dbname, const std::string& wal_dir,
-    const std::vector<DbPath>& data_paths) {
-  Status s = CreateAndNewDirectory(env, dbname, &db_dir_);
-  if (!s.ok()) {
-    return s;
-  }
-  if (!wal_dir.empty() && dbname != wal_dir) {
-    s = CreateAndNewDirectory(env, wal_dir, &wal_dir_);
-    if (!s.ok()) {
-      return s;
-    }
+void DBImpl::MarkLogsSynced(
+    uint64_t up_to, bool synced_dir, const Status& status) {
+  mutex_.AssertHeld();
+  if (synced_dir &&
+      logfile_number_ == up_to &&
+      status.ok()) {
+    log_dir_synced_ = true;
   }
-
-  data_dirs_.clear();
-  for (auto& p : data_paths) {
-    const std::string db_path = p.path;
-    if (db_path == dbname) {
-      data_dirs_.emplace_back(nullptr);
+  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+    auto& log = *it;
+    assert(log.getting_synced);
+    if (status.ok() && logs_.size() > 1) {
+      logs_to_free_.push_back(log.ReleaseWriter());
+      it = logs_.erase(it);
     } else {
-      std::unique_ptr<Directory> path_directory;
-      s = CreateAndNewDirectory(env, db_path, &path_directory);
-      if (!s.ok()) {
-        return s;
-      }
-      data_dirs_.emplace_back(path_directory.release());
+      log.getting_synced = false;
+      ++it;
     }
   }
-  assert(data_dirs_.size() == data_paths.size());
-  return Status::OK();
+  assert(!status.ok() || logs_.empty() || logs_[0].number > up_to ||
+         (logs_.size() == 1 && !logs_[0].getting_synced));
+  log_sync_cv_.SignalAll();
 }
 
-Directory* DBImpl::Directories::GetDataDir(size_t path_id) {
-  assert(path_id < data_dirs_.size());
-  Directory* ret_dir = data_dirs_[path_id].get();
-  if (ret_dir == nullptr) {
-    // Should use db_dir_
-    return db_dir_.get();
-  }
-  return ret_dir;
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+  return versions_->LastSequence();
 }
 
-Status DBImpl::Recover(
-    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
-    bool error_if_log_file_exist, bool error_if_data_exists_in_logs) {
-  mutex_.AssertHeld();
-
-  bool is_new_db = false;
-  assert(db_lock_ == nullptr);
-  if (!read_only) {
-    Status s = directories_.SetDirectories(env_, dbname_,
-                                           immutable_db_options_.wal_dir,
-                                           immutable_db_options_.db_paths);
-    if (!s.ok()) {
-      return s;
-    }
+InternalIterator* DBImpl::NewInternalIterator(
+    Arena* arena, RangeDelAggregator* range_del_agg,
+    ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
 
-    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
-    if (!s.ok()) {
-      return s;
-    }
+  mutex_.Lock();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  mutex_.Unlock();
+  ReadOptions roptions;
+  return NewInternalIterator(roptions, cfd, super_version, arena,
+                             range_del_agg);
+}
+
+void DBImpl::SchedulePurge() {
+  mutex_.AssertHeld();
+  assert(opened_successfully_);
+
+  // Purge operations are put into High priority queue
+  bg_purge_scheduled_++;
+  env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
+void DBImpl::BackgroundCallPurge() {
+  mutex_.Lock();
 
+  // We use one single loop to clear both queues so that after exiting the
+  // loop both queues are empty. This is stricter than what is needed, but
+  // can make it easier for us to reason about the correctness.
+  while (!purge_queue_.empty() || !logs_to_free_queue_.empty()) {
+    if (!purge_queue_.empty()) {
+      auto purge_file = purge_queue_.begin();
+      auto fname = purge_file->fname;
+      auto type = purge_file->type;
+      auto number = purge_file->number;
+      auto path_id = purge_file->path_id;
+      auto job_id = purge_file->job_id;
+      purge_queue_.pop_front();
 
-    s = env_->FileExists(CurrentFileName(dbname_));
-    if (s.IsNotFound()) {
-      if (immutable_db_options_.create_if_missing) {
-        s = NewDB();
-        is_new_db = true;
-        if (!s.ok()) {
-          return s;
-        }
-      } else {
-        return Status::InvalidArgument(
-            dbname_, "does not exist (create_if_missing is false)");
-      }
-    } else if (s.ok()) {
-      if (immutable_db_options_.error_if_exists) {
-        return Status::InvalidArgument(
-            dbname_, "exists (error_if_exists is true)");
-      }
+      mutex_.Unlock();
+      Status file_deletion_status;
+      DeleteObsoleteFileImpl(file_deletion_status, job_id, fname, type, number,
+                             path_id);
+      mutex_.Lock();
     } else {
-      // Unexpected error reading file
-      assert(s.IsIOError());
-      return s;
-    }
-    // Check for the IDENTITY file and create it if not there
-    s = env_->FileExists(IdentityFileName(dbname_));
-    if (s.IsNotFound()) {
-      s = SetIdentityFile(env_, dbname_);
-      if (!s.ok()) {
-        return s;
-      }
-    } else if (!s.ok()) {
-      assert(s.IsIOError());
-      return s;
+      assert(!logs_to_free_queue_.empty());
+      log::Writer* log_writer = *(logs_to_free_queue_.begin());
+      logs_to_free_queue_.pop_front();
+      mutex_.Unlock();
+      delete log_writer;
+      mutex_.Lock();
     }
   }
+  bg_purge_scheduled_--;
 
-  Status s = versions_->Recover(column_families, read_only);
-  if (immutable_db_options_.paranoid_checks && s.ok()) {
-    s = CheckConsistency();
-  }
-  if (s.ok()) {
-    SequenceNumber next_sequence(kMaxSequenceNumber);
-    default_cf_handle_ = new ColumnFamilyHandleImpl(
-        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
-    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
-    single_column_family_mode_ =
-        versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
-
-    // Recover from all newer log files than the ones named in the
-    // descriptor (new log files may have been added by the previous
-    // incarnation without registering them in the descriptor).
-    //
-    // Note that prev_log_number() is no longer used, but we pay
-    // attention to it in case we are recovering a database
-    // produced by an older version of rocksdb.
+  bg_cv_.SignalAll();
+  // IMPORTANT: there should be no code after calling SignalAll. This call may
+  // signal the DB destructor that it's OK to proceed with destruction. In
+  // that case, all DB variables will be deallocated and referencing them
+  // will cause trouble.
+  mutex_.Unlock();
+}
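Reviewer note: `BackgroundCallPurge()` above drains two work queues in one loop, dropping the mutex around each item's slow work and re-checking both queues after every relock. A minimal standalone sketch of the pattern (hypothetical `Purger` type with `int` payloads, not the RocksDB classes):

```cpp
#include <deque>
#include <mutex>

// Sketch: one loop empties two queues; the lock is released around each
// item's (potentially slow) work so producers can keep enqueueing.
struct Purger {
  std::mutex mu;
  std::deque<int> purge_queue;
  std::deque<int> logs_to_free_queue;

  void DoSlowDelete(int) {}  // placeholder for file deletion
  void DoSlowFree(int) {}    // placeholder for deleting a log writer

  void DrainBoth() {
    std::unique_lock<std::mutex> lock(mu);
    // Re-test both queues on every iteration: new items may have been
    // pushed while the lock was released below.
    while (!purge_queue.empty() || !logs_to_free_queue.empty()) {
      if (!purge_queue.empty()) {
        int item = purge_queue.front();
        purge_queue.pop_front();
        lock.unlock();
        DoSlowDelete(item);  // no lock held during the slow part
        lock.lock();
      } else {
        int writer = logs_to_free_queue.front();
        logs_to_free_queue.pop_front();
        lock.unlock();
        DoSlowFree(writer);
        lock.lock();
      }
    }
  }
};

int main() {
  Purger p;
  p.purge_queue = {1, 2};
  p.logs_to_free_queue = {3};
  p.DrainBoth();  // leaves both queues empty
  return 0;
}
```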
 
 namespace {
 struct IterState {
   IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version,
             bool _background_purge)
       : db(_db),
         mu(_mu),
         super_version(_super_version),
         background_purge(_background_purge) {}
 
   DBImpl* db;
   InstrumentedMutex* mu;
   SuperVersion* super_version;
   bool background_purge;
 };
 
 static void CleanupIteratorState(void* arg1, void* arg2) {
   IterState* state = reinterpret_cast<IterState*>(arg1);
 
-    std::vector<std::string> filenames;
-    s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
-    if (!s.ok()) {
-      return s;
-    }
-
-    std::vector<uint64_t> logs;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      uint64_t number;
-      FileType type;
-      if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
-        if (is_new_db) {
-          return Status::Corruption(
-              "While creating a new DB, wal_dir contains "
-              "existing log file: ",
-              filenames[i]);
-        } else {
-          logs.push_back(number);
-        }
-      }
-    }
 
   if (state->super_version->Unref()) {
     // Job id == 0 means that this is not our background process, but rather
     // user thread
     JobContext job_context(0);
 
-    if (logs.size() > 0) {
-      if (error_if_log_file_exist) {
-        return Status::Corruption(
-            "The db was opened in readonly mode with error_if_log_file_exist "
-            "flag but a log file already exists");
-      } else if (error_if_data_exists_in_logs) {
-        for (auto& log : logs) {
-          std::string fname = LogFileName(immutable_db_options_.wal_dir, log);
-          uint64_t bytes;
-          s = env_->GetFileSize(fname, &bytes);
-          if (s.ok()) {
-            if (bytes > 0) {
-              return Status::Corruption(
-                  "error_if_data_exists_in_logs is set but there is data "
-                  "in log files.");
-            }
-          }
-        }
-      }
-    }
 
     state->mu->Lock();
     state->super_version->Cleanup();
     state->db->FindObsoleteFiles(&job_context, false, true);
     if (state->background_purge) {
       state->db->ScheduleBgLogWriterClose(&job_context);
     }
     state->mu->Unlock();
 
-    if (!logs.empty()) {
-      // Recover in the order in which the logs were generated
-      std::sort(logs.begin(), logs.end());
-      s = RecoverLogFiles(logs, &next_sequence, read_only);
-      if (!s.ok()) {
-        // Clear memtables if recovery failed
-        for (auto cfd : *versions_->GetColumnFamilySet()) {
-          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
-                                 kMaxSequenceNumber);
-        }
-      }
-    }
-  }
 
     delete state->super_version;
     if (job_context.HaveSomethingToDelete()) {
       if (state->background_purge) {
-  // Initial value
-  max_total_in_memory_state_ = 0;
-  for (auto cfd : *versions_->GetColumnFamilySet()) {
-    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
-    max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
-                                  mutable_cf_options->max_write_buffer_number;
-  }
-
-  return s;
-}
-
-// REQUIRES: log_numbers are sorted in ascending order
-Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
-                               SequenceNumber* next_sequence, bool read_only) {
-  struct LogReporter : public log::Reader::Reporter {
-    Env* env;
-    Logger* info_log;
-    const char* fname;
-    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
-    virtual void Corruption(size_t bytes, const Status& s) override {
-      ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
-                     (this->status == nullptr ? "(ignoring error) " : ""),
-                     fname, static_cast<int>(bytes), s.ToString().c_str());
-      if (this->status != nullptr && this->status->ok()) {
-        *this->status = s;
-      }
-    }
-  };
-
-  mutex_.AssertHeld();
-  Status status;
-  std::unordered_map<int, VersionEdit> version_edits;
-  // no need to refcount because iteration is under mutex
-  for (auto cfd : *versions_->GetColumnFamilySet()) {
-    VersionEdit edit;
-    edit.SetColumnFamily(cfd->GetID());
-    version_edits.insert({cfd->GetID(), edit});
-  }
-  int job_id = next_job_id_.fetch_add(1);
-  {
-    auto stream = event_logger_.Log();
-    stream << "job" << job_id << "event"
-           << "recovery_started";
-    stream << "log_files";
-    stream.StartArray();
-    for (auto log_number : log_numbers) {
-      stream << log_number;
-    }
-    stream.EndArray();
-  }
-
-#ifndef ROCKSDB_LITE
-  if (immutable_db_options_.wal_filter != nullptr) {
-    std::map<std::string, uint32_t> cf_name_id_map;
-    std::map<uint32_t, uint64_t> cf_lognumber_map;
-    for (auto cfd : *versions_->GetColumnFamilySet()) {
-      cf_name_id_map.insert(
-          std::make_pair(cfd->GetName(), cfd->GetID()));
-      cf_lognumber_map.insert(
-          std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
-    }
-
-    immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map,
-                                                               cf_name_id_map);
-  }
-#endif
-
-  bool stop_replay_by_wal_filter = false;
-  bool stop_replay_for_corruption = false;
-  bool flushed = false;
-  for (auto log_number : log_numbers) {
-    // The previous incarnation may not have written any MANIFEST
-    // records after allocating this log number. So we manually
-    // update the file number allocation counter in VersionSet.
-    versions_->MarkFileNumberUsedDuringRecovery(log_number);
-    // Open the log file
-    std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
-
-    ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                   "Recovering log #%" PRIu64 " mode %d", log_number,
-                   immutable_db_options_.wal_recovery_mode);
-    auto logFileDropped = [this, &fname]() {
-      uint64_t bytes;
-      if (env_->GetFileSize(fname, &bytes).ok()) {
-        auto info_log = immutable_db_options_.info_log.get();
-        ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
-                       static_cast<int>(bytes));
-      }
-    };
-    if (stop_replay_by_wal_filter) {
-      logFileDropped();
-      continue;
-    }
-
-    unique_ptr<SequentialFileReader> file_reader;
-    {
-      unique_ptr<SequentialFile> file;
-      status = env_->NewSequentialFile(fname, &file, env_options_);
-      if (!status.ok()) {
-        MaybeIgnoreError(&status);
-        if (!status.ok()) {
-          return status;
-        } else {
-          // Fail with one log file, but that's ok.
-          // Try next one.
+        // PurgeObsoleteFiles here does not delete files. Instead, it adds the
+        // files to be deleted to a job queue, and deletes them in a separate
+        // background thread.
+        state->db->PurgeObsoleteFiles(job_context, true /* schedule only */);
+        state->mu->Lock();
+        state->db->SchedulePurge();
+        state->mu->Unlock();
+      } else {
+        state->db->PurgeObsoleteFiles(job_context);
+      }
+    }
+    job_context.Clean();
   }
 
+  delete state;
+}
+}  // namespace
+
+InternalIterator* DBImpl::NewInternalIterator(
+    const ReadOptions& read_options, ColumnFamilyData* cfd,
+    SuperVersion* super_version, Arena* arena,
+    RangeDelAggregator* range_del_agg) {
+  InternalIterator* internal_iter;
+  assert(arena != nullptr);
+  assert(range_del_agg != nullptr);
+  // Need to create internal iterator from the arena.
+  MergeIteratorBuilder merge_iter_builder(
+      &cfd->internal_comparator(), arena,
+      !read_options.total_order_seek &&
+          cfd->ioptions()->prefix_extractor != nullptr);
+  // Collect iterator for mutable mem
+  merge_iter_builder.AddIterator(
+      super_version->mem->NewIterator(read_options, arena));
+  std::unique_ptr<InternalIterator> range_del_iter;
+  Status s;
+  if (!read_options.ignore_range_deletions) {
+    range_del_iter.reset(
+        super_version->mem->NewRangeTombstoneIterator(read_options));
+    s = range_del_agg->AddTombstones(std::move(range_del_iter));
+  }
+  // Collect all needed child iterators for immutable memtables
+  if (s.ok()) {
+    super_version->imm->AddIterators(read_options, &merge_iter_builder);
+    if (!read_options.ignore_range_deletions) {
+      s = super_version->imm->AddRangeTombstoneIterators(read_options, arena,
+                                                         range_del_agg);
+    }
+  }
+  if (s.ok()) {
+    // Collect iterators for files in L0 - Ln
+    if (read_options.read_tier != kMemtableTier) {
+      super_version->current->AddIterators(read_options, env_options_,
+                                           &merge_iter_builder, range_del_agg);
+    }
+    internal_iter = merge_iter_builder.Finish();
+    IterState* cleanup =
+        new IterState(this, &mutex_, super_version,
+                      read_options.background_purge_on_iterator_cleanup);
+    internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+    return internal_iter;
+  }
+  return NewErrorInternalIterator(s);
+}
-          continue;
-        }
-      }
-      file_reader.reset(new SequentialFileReader(std::move(file)));
-    }
 
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+  return default_cf_handle_;
+}
 
-    // Create the log reader.
-    LogReporter reporter;
-    reporter.env = env_;
-    reporter.info_log = immutable_db_options_.info_log.get();
-    reporter.fname = fname.c_str();
-    if (!immutable_db_options_.paranoid_checks ||
-        immutable_db_options_.wal_recovery_mode ==
-            WALRecoveryMode::kSkipAnyCorruptedRecords) {
-      reporter.status = nullptr;
-    } else {
-      reporter.status = &status;
-    }
-    // We intentionally make log::Reader do checksumming even if
-    // paranoid_checks==false so that corruptions cause entire commits
-    // to be skipped instead of propagating bad information (like overly
-    // large sequence numbers).
-    log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
-                       &reporter, true /*checksum*/, 0 /*initial_offset*/,
-                       log_number);
 
+Status DBImpl::Get(const ReadOptions& read_options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableSlice* value) {
+  return GetImpl(read_options, column_family, key, value);
+}
 
-    // Determine if we should tolerate incomplete records at the tail end of
-    // the log.
-    // Read all the records and add to a memtable
-    std::string scratch;
-    Slice record;
-    WriteBatch batch;
 
+Status DBImpl::GetImpl(const ReadOptions& read_options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       PinnableSlice* pinnable_val, bool* value_found) {
+  assert(pinnable_val != nullptr);
+  StopWatch sw(env_, stats_, DB_GET);
+  PERF_TIMER_GUARD(get_snapshot_time);
 
-    while (!stop_replay_by_wal_filter &&
-           reader.ReadRecord(&record, &scratch,
-                             immutable_db_options_.wal_recovery_mode) &&
-           status.ok()) {
-      if (record.size() < WriteBatchInternal::kHeader) {
-        reporter.Corruption(record.size(),
-                            Status::Corruption("log record too small"));
-        continue;
-      }
-      WriteBatchInternal::SetContents(&batch, record);
-      SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
-
-      if (immutable_db_options_.wal_recovery_mode ==
-          WALRecoveryMode::kPointInTimeRecovery) {
-        // In point-in-time recovery mode, if sequence ids of log files are
-        // consecutive, we continue recovery despite corruption. This could
-        // happen when we open and write to a corrupted DB, where sequence ids
-        // will start from the last sequence id we recovered.
-        if (sequence == *next_sequence) {
-          stop_replay_for_corruption = false;
-        }
-        if (stop_replay_for_corruption) {
-          logFileDropped();
-          break;
-        }
-      }
-
-#ifndef ROCKSDB_LITE
-      if (immutable_db_options_.wal_filter != nullptr) {
-        WriteBatch new_batch;
-        bool batch_changed = false;
-
-        WalFilter::WalProcessingOption wal_processing_option =
-            immutable_db_options_.wal_filter->LogRecordFound(
-                log_number, fname, batch, &new_batch, &batch_changed);
-
-        switch (wal_processing_option) {
-          case WalFilter::WalProcessingOption::kContinueProcessing:
-            // do nothing, proceed normally
-            break;
-          case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
-            // skip current record
-            continue;
-          case WalFilter::WalProcessingOption::kStopReplay:
-            // skip current record and stop replay
-            stop_replay_by_wal_filter = true;
-            continue;
-          case WalFilter::WalProcessingOption::kCorruptedRecord: {
-            status =
-                Status::Corruption("Corruption reported by Wal Filter ",
-                                   immutable_db_options_.wal_filter->Name());
-            MaybeIgnoreError(&status);
-            if (!status.ok()) {
-              reporter.Corruption(record.size(), status);
-              continue;
-            }
-            break;
-          }
-          default: {
-            assert(false);  // unhandled case
-            status = Status::NotSupported(
-                "Unknown WalProcessingOption returned"
-                " by Wal Filter ",
-                immutable_db_options_.wal_filter->Name());
-            MaybeIgnoreError(&status);
-            if (!status.ok()) {
-              return status;
-            } else {
-              // Ignore the error with current record processing.
-              continue;
-            }
-          }
-        }
-
-        if (batch_changed) {
-          // Make sure that the count in the new batch is
-          // within the original count.
-          int new_count = WriteBatchInternal::Count(&new_batch);
-          int original_count = WriteBatchInternal::Count(&batch);
-          if (new_count > original_count) {
-            ROCKS_LOG_FATAL(
-                immutable_db_options_.info_log,
-                "Recovering log #%" PRIu64
-                " mode %d log filter %s returned "
-                "more records (%d) than original (%d) which is not allowed. "
-                "Aborting recovery.",
-                log_number, immutable_db_options_.wal_recovery_mode,
-                immutable_db_options_.wal_filter->Name(), new_count,
-                original_count);
-            status = Status::NotSupported(
-                "More than original # of records "
-                "returned by Wal Filter ",
-                immutable_db_options_.wal_filter->Name());
-            return status;
-          }
-          // Set the same sequence number in the new_batch
-          // as the original batch.
-          WriteBatchInternal::SetSequence(&new_batch,
-                                          WriteBatchInternal::Sequence(&batch));
-          batch = new_batch;
-        }
-      }
-#endif  // ROCKSDB_LITE
-
-      // If column family was not found, it might mean that the WAL write
-      // batch references to the column family that was dropped after the
-      // insert. We don't want to fail the whole write batch in that case --
-      // we just ignore the update.
-      // That's why we set ignore missing column families to true
-      bool has_valid_writes = false;
-      status = WriteBatchInternal::InsertInto(
-          &batch, column_family_memtables_.get(), &flush_scheduler_, true,
-          log_number, this, false /* concurrent_memtable_writes */,
-          next_sequence, &has_valid_writes);
-      MaybeIgnoreError(&status);
-      if (!status.ok()) {
-        // We are treating this as a failure while reading since we read valid
-        // blocks that do not form coherent data
-        reporter.Corruption(record.size(), status);
-        continue;
-      }
 
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
 
-      if (has_valid_writes && !read_only) {
-        // we can do this because this is called before client has access to the
-        // DB and there is only a single thread operating on DB
-        ColumnFamilyData* cfd;
-
-        while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
-          cfd->Unref();
-          // If this asserts, it means that InsertInto failed in
-          // filtering updates to already-flushed column families
-          assert(cfd->GetLogNumber() <= log_number);
-          auto iter = version_edits.find(cfd->GetID());
-          assert(iter != version_edits.end());
-          VersionEdit* edit = &iter->second;
-          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
-          if (!status.ok()) {
-            // Reflect errors immediately so that conditions like full
-            // file-systems cause the DB::Open() to fail.
-            return status;
-          }
-          flushed = true;
 
+  // Acquire SuperVersion
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
 
-          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
-                                 *next_sequence);
-        }
-      }
-    }
 
+  TEST_SYNC_POINT("DBImpl::GetImpl:1");
+  TEST_SYNC_POINT("DBImpl::GetImpl:2");
 
-    if (!status.ok()) {
-      if (immutable_db_options_.wal_recovery_mode ==
-          WALRecoveryMode::kSkipAnyCorruptedRecords) {
-        // We should ignore all errors unconditionally
-        status = Status::OK();
-      } else if (immutable_db_options_.wal_recovery_mode ==
-                 WALRecoveryMode::kPointInTimeRecovery) {
-        // We should ignore the error but not continue replaying
-        status = Status::OK();
-        stop_replay_for_corruption = true;
-        ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                       "Point in time recovered to log #%" PRIu64
-                       " seq #%" PRIu64,
-                       log_number, *next_sequence);
-      } else {
-        assert(immutable_db_options_.wal_recovery_mode ==
-                   WALRecoveryMode::kTolerateCorruptedTailRecords ||
-               immutable_db_options_.wal_recovery_mode ==
-                   WALRecoveryMode::kAbsoluteConsistency);
-        return status;
-      }
-    }
 
+  SequenceNumber snapshot;
+  if (read_options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(
+        read_options.snapshot)->number_;
+  } else {
+    // Since we get and reference the super version before getting
+    // the snapshot number, without a mutex protection, it is possible
+    // that a memtable switch happened in the middle and not all the
+    // data for this snapshot is available. But it will contain all
+    // the data available in the super version we have, which is also
+    // a valid snapshot to read from.
-    flush_scheduler_.Clear();
-    auto last_sequence = *next_sequence - 1;
-    if ((*next_sequence != kMaxSequenceNumber) &&
-        (versions_->LastSequence() <= last_sequence)) {
-      versions_->SetLastSequence(last_sequence);
-    }
-  }
+    // We shouldn't get the snapshot before finding and referencing the
+    // super version because a flush happening in between may compact
+    // away data for the snapshot, but the snapshot is earlier than the
+    // data overwriting it, so users may see wrong results.
+    snapshot = versions_->LastSequence();
+  }
+  TEST_SYNC_POINT("DBImpl::GetImpl:3");
+  TEST_SYNC_POINT("DBImpl::GetImpl:4");
 
-  // True if there's any data in the WALs; if not, we can skip re-processing
-  // them later
-  bool data_seen = false;
-  if (!read_only) {
-    // no need to refcount since client still doesn't have access
-    // to the DB and can not drop column families while we iterate
-    auto max_log_number = log_numbers.back();
-    for (auto cfd : *versions_->GetColumnFamilySet()) {
-      auto iter = version_edits.find(cfd->GetID());
-      assert(iter != version_edits.end());
-      VersionEdit* edit = &iter->second;
-
-      if (cfd->GetLogNumber() > max_log_number) {
-        // Column family cfd has already flushed the data
-        // from all logs. Memtable has to be empty because
-        // we filter the updates based on log_number
-        // (in WriteBatch::InsertInto)
-        assert(cfd->mem()->GetFirstSequenceNumber() == 0);
-        assert(edit->NumEntries() == 0);
-        continue;
-      }
-
-      // flush the final memtable (if non-empty)
-      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
-        // If flush happened in the middle of recovery (e.g. due to memtable
-        // being full), we flush at the end. Otherwise we'll need to record
-        // where we were on last flush, which makes the logic complicated.
-        if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
-          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
-          if (!status.ok()) {
-            // Recovery failed
-            break;
-          }
-          flushed = true;
 
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+  RangeDelAggregator range_del_agg(cfd->internal_comparator(), snapshot);
 
-          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
-                                 versions_->LastSequence());
-        }
-        data_seen = true;
-      }
 
+  Status s;
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  LookupKey lkey(key, snapshot);
+  PERF_TIMER_STOP(get_snapshot_time);
 
-      // write MANIFEST with update
-      // writing log_number in the manifest means that any log file
-      // with number strongly less than (log_number + 1) is already
-      // recovered and should be ignored on next reincarnation.
-      // Since we already recovered max_log_number, we want all logs
-      // with numbers `<= max_log_number` (includes this one) to be ignored
-      if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
-        edit->SetLogNumber(max_log_number + 1);
-      }
+  bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+                        has_unpersisted_data_.load(std::memory_order_relaxed));
+  bool done = false;
+  if (!skip_memtable) {
+    if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+                     &range_del_agg, read_options)) {
+      done = true;
+      pinnable_val->PinSelf();
+      RecordTick(stats_, MEMTABLE_HIT);
+    } else if ((s.ok() || s.IsMergeInProgress()) &&
+               sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+                            &range_del_agg, read_options)) {
+      done = true;
+      pinnable_val->PinSelf();
+      RecordTick(stats_, MEMTABLE_HIT);
+    }
-      // we must mark the next log number as used, even though it's
-      // not actually used.
that is because VersionSet assumes - // VersionSet::next_file_number_ always to be strictly greater than any - // log number - versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1); - status = versions_->LogAndApply( - cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); - if (!status.ok()) { - // Recovery failed - break; - } + bool skip_memtable = (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + bool done = false; + if (!skip_memtable) { + if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &range_del_agg, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &range_del_agg, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); } - } - - if (data_seen && !flushed) { - // Mark these as alive so they'll be considered for deletion later by - // FindObsoleteFiles() - for (auto log_number : log_numbers) { - alive_log_files_.push_back(LogFileNumberSize(log_number)); + if (!done && !s.ok() && !s.IsMergeInProgress()) { + return s; } } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, + &range_del_agg, value_found); + RecordTick(stats_, MEMTABLE_MISS); + } - event_logger_.Log() << "job" << job_id << "event" - << "recovery_finished"; - - return status; -} - -Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, - MemTable* mem, VersionEdit* edit) { - mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); - FileMetaData meta; - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); - meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); - ReadOptions ro; - ro.total_order_seek = true; - Arena arena; - Status s; - TableProperties table_properties; { - ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); - ROCKS_LOG_DEBUG(immutable_db_options_.info_log, - "[%s] [WriteLevel0TableForRecovery]" - " Level-0 table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - - // Get the latest mutable cf options while the mutex is still locked - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); - bool paranoid_file_checks = - cfd->GetLatestMutableCFOptions()->paranoid_file_checks; - { - mutex_.Unlock(); + PERF_TIMER_GUARD(get_post_process_time); - SequenceNumber earliest_write_conflict_snapshot; - std::vector snapshot_seqs = - snapshots_.GetAll(&earliest_write_conflict_snapshot); - - s = BuildTable( - dbname_, env_, *cfd->ioptions(), mutable_cf_options, env_options_, - cfd->table_cache(), iter.get(), - std::unique_ptr(mem->NewRangeTombstoneIterator(ro)), - &meta, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), - snapshot_seqs, earliest_write_conflict_snapshot, - GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), - cfd->ioptions()->compression_opts, paranoid_file_checks, - cfd->internal_stats(), TableFileCreationReason::kRecovery, - &event_logger_, job_id); - LogFlush(immutable_db_options_.info_log); - ROCKS_LOG_DEBUG(immutable_db_options_.info_log, - "[%s] [WriteLevel0TableForRecovery]" - " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), - meta.fd.GetFileSize(), s.ToString().c_str()); - mutex_.Lock(); - } - } - 
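The GetImpl() interleaved through this hunk is also what backs the PinnableSlice overload of DB::Get, which avoids a copy into std::string when the value can be served from a pinned source. A minimal caller-side sketch, assuming an already-open rocksdb::DB* (setup elided; the helper name is hypothetical):

#include <cstdio>
#include "rocksdb/db.h"

// Exercises the pinned read path served by DBImpl::GetImpl above.
void PinnedGetSketch(rocksdb::DB* db) {
  rocksdb::PinnableSlice pinned;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(),
                              db->DefaultColumnFamily(), "some_key", &pinned);
  if (s.ok()) {
    // The bytes stay valid until `pinned` is Reset() or destroyed.
    std::printf("%.*s\n", static_cast<int>(pinned.size()), pinned.data());
  }
}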
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); - - // Note that if file_size is zero, the file has been deleted and - // should not be added to the manifest. - int level = 0; - if (s.ok() && meta.fd.GetFileSize() > 0) { - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno, - meta.marked_for_compaction); - } + ReturnAndCleanupSuperVersion(cfd, sv); - InternalStats::CompactionStats stats(1); - stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.fd.GetFileSize(); - stats.num_output_files = 1; - cfd->internal_stats()->AddCompactionStats(level, stats); - cfd->internal_stats()->AddCFStats( - InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize()); - RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + MeasureTime(stats_, BYTES_PER_READ, size); + } return s; } -Status DBImpl::SyncClosedLogs(JobContext* job_context) { - TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); - mutex_.AssertHeld(); - autovector logs_to_sync; - uint64_t current_log_number = logfile_number_; - while (logs_.front().number < current_log_number && - logs_.front().getting_synced) { - log_sync_cv_.Wait(); - } - for (auto it = logs_.begin(); - it != logs_.end() && it->number < current_log_number; ++it) { - auto& log = *it; - assert(!log.getting_synced); - log.getting_synced = true; - logs_to_sync.push_back(log.writer); - } - - Status s; - if (!logs_to_sync.empty()) { - mutex_.Unlock(); +std::vector DBImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { - for (log::Writer* log : logs_to_sync) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, - log->get_log_number()); - s = log->file()->Sync(immutable_db_options_.use_fsync); - } - if (s.ok()) { - s = directories_.GetWalDir()->Fsync(); - } + StopWatch sw(env_, stats_, DB_MULTIGET); + PERF_TIMER_GUARD(get_snapshot_time); - mutex_.Lock(); + SequenceNumber snapshot; - // "number <= current_log_number - 1" is equivalent to - // "number < current_log_number". 
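The WAL-replay dispatch being moved out above keys entirely off one user-visible option; all four modes it names are public. A configuration sketch (option and enum names from rocksdb/options.h; the comments restate the handling above):

#include "rocksdb/options.h"

// kAbsoluteConsistency: fail DB::Open on any WAL corruption.
// kPointInTimeRecovery: replay up to the first corrupt record, then stop.
// kTolerateCorruptedTailRecords: drop a corrupt tail, keep the rest.
// kSkipAnyCorruptedRecords: ignore all WAL errors unconditionally.
rocksdb::Options RecoveryModeSketch() {
  rocksdb::Options options;
  options.wal_recovery_mode =
      rocksdb::WALRecoveryMode::kPointInTimeRecovery;
  return options;
}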
- MarkLogsSynced(current_log_number - 1, true, s); - if (!s.ok()) { - bg_error_ = s; - TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); - return s; + struct MultiGetColumnFamilyData { + ColumnFamilyData* cfd; + SuperVersion* super_version; + }; + std::unordered_map multiget_cf_data; + // fill up and allocate outside of mutex + for (auto cf : column_family) { + auto cfh = reinterpret_cast(cf); + auto cfd = cfh->cfd(); + if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { + auto mgcfd = new MultiGetColumnFamilyData(); + mgcfd->cfd = cfd; + multiget_cf_data.insert({cfd->GetID(), mgcfd}); } } - return s; -} - -Status DBImpl::FlushMemTableToOutputFile( - ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) { - mutex_.AssertHeld(); - assert(cfd->imm()->NumNotFlushed() != 0); - assert(cfd->imm()->IsFlushPending()); - SequenceNumber earliest_write_conflict_snapshot; - std::vector snapshot_seqs = - snapshots_.GetAll(&earliest_write_conflict_snapshot); + mutex_.Lock(); + if (read_options.snapshot != nullptr) { + snapshot = reinterpret_cast( + read_options.snapshot)->number_; + } else { + snapshot = versions_->LastSequence(); + } + for (auto mgd_iter : multiget_cf_data) { + mgd_iter.second->super_version = + mgd_iter.second->cfd->GetSuperVersion()->Ref(); + } + mutex_.Unlock(); - FlushJob flush_job( - dbname_, cfd, immutable_db_options_, mutable_cf_options, env_options_, - versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, - earliest_write_conflict_snapshot, job_context, log_buffer, - directories_.GetDbDir(), directories_.GetDataDir(0U), - GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, - &event_logger_, mutable_cf_options.report_bg_io_stats); + // Contain a list of merge operations if merge occurs. + MergeContext merge_context; - FileMetaData file_meta; + // Note: this always resizes the values array + size_t num_keys = keys.size(); + std::vector stat_list(num_keys); + values->resize(num_keys); - flush_job.PickMemTable(); + // Keep track of bytes that we read for statistics-recording later + uint64_t bytes_read = 0; + PERF_TIMER_STOP(get_snapshot_time); - Status s; - if (logfile_number_ > 0 && - versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 0) { - // If there are more than one column families, we need to make sure that - // all the log files except the most recent one are synced. Otherwise if - // the host crashes after flushing and before WAL is persistent, the - // flushed SST may contain data from write batches whose updates to - // other column families are missing. - // SyncClosedLogs() may unlock and re-lock the db_mutex. - s = SyncClosedLogs(job_context); - } + // For each of the given keys, apply the entire "get" process as follows: + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. + for (size_t i = 0; i < num_keys; ++i) { + merge_context.Clear(); + Status& s = stat_list[i]; + std::string* value = &(*values)[i]; - // Within flush_job.Run, rocksdb may call event listener to notify - // file creation and deletion. - // - // Note that flush_job.Run will unlock and lock the db_mutex, - // and EventListener callback will be called when the db_mutex - // is unlocked by the current thread. 
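Since flush_job.Run() fires listener callbacks only while the db_mutex is released, the listener side of that contract can be sketched from the public API as follows (FlushJobInfo fields as populated by NotifyOnFlushCompleted below; the class name and registration line are illustrative):

#include <cstdio>
#include <memory>
#include "rocksdb/listener.h"

// Runs on the background flush thread with the DB mutex dropped, so it may
// call back into the DB, but it should not block for long.
class FlushObserver : public rocksdb::EventListener {
 public:
  void OnFlushCompleted(rocksdb::DB* /*db*/,
                        const rocksdb::FlushJobInfo& info) override {
    std::fprintf(stderr, "[flush] cf=%s file=%s write_stall=%d\n",
                 info.cf_name.c_str(), info.file_path.c_str(),
                 static_cast<int>(info.triggered_writes_stop));
  }
};
// Registration: options.listeners.push_back(std::make_shared<FlushObserver>());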
- if (s.ok()) { - s = flush_job.Run(&file_meta); - } else { - flush_job.Cancel(); - } + LookupKey lkey(keys[i], snapshot); + auto cfh = reinterpret_cast(column_family[i]); + RangeDelAggregator range_del_agg(cfh->cfd()->internal_comparator(), + snapshot); + auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); + assert(mgd_iter != multiget_cf_data.end()); + auto mgd = mgd_iter->second; + auto super_version = mgd->super_version; + bool skip_memtable = + (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + bool done = false; + if (!skip_memtable) { + if (super_version->mem->Get(lkey, value, &s, &merge_context, + &range_del_agg, read_options)) { + done = true; + // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + } else if (super_version->imm->Get(lkey, value, &s, &merge_context, + &range_del_agg, read_options)) { + done = true; + // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + } + } + if (!done) { + PinnableSlice pinnable_val; + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, &pinnable_val, &s, + &merge_context, &range_del_agg); + value->assign(pinnable_val.data(), pinnable_val.size()); + // TODO(?): RecordTick(stats_, MEMTABLE_MISS)? + } - if (s.ok()) { - InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context, - mutable_cf_options); - if (made_progress) { - *made_progress = 1; + if (s.ok()) { + bytes_read += value->size(); } - VersionStorageInfo::LevelSummaryStorage tmp; - ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", - cfd->GetName().c_str(), - cfd->current()->storage_info()->LevelSummary(&tmp)); } - if (!s.ok() && !s.IsShutdownInProgress() && - immutable_db_options_.paranoid_checks && bg_error_.ok()) { - // if a bad error happened (not ShutdownInProgress) and paranoid_checks is - // true, mark DB read-only - bg_error_ = s; - } - if (s.ok()) { -#ifndef ROCKSDB_LITE - // may temporarily unlock and lock the mutex. - NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, - job_context->job_id, flush_job.GetTableProperties()); - auto sfm = static_cast( - immutable_db_options_.sst_file_manager.get()); - if (sfm) { - // Notify sst_file_manager that a new file was added - std::string file_path = MakeTableFileName( - immutable_db_options_.db_paths[0].path, file_meta.fd.GetNumber()); - sfm->OnAddFile(file_path); - if (sfm->IsMaxAllowedSpaceReached() && bg_error_.ok()) { - bg_error_ = Status::IOError("Max allowed space was reached"); - TEST_SYNC_POINT_CALLBACK( - "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", - &bg_error_); - } + // Post processing (decrement reference counts and record statistics) + PERF_TIMER_GUARD(get_post_process_time); + autovector superversions_to_delete; + + // TODO(icanadi) do we need lock here or just around Cleanup()? 
+ mutex_.Lock(); + for (auto mgd_iter : multiget_cf_data) { + auto mgd = mgd_iter.second; + if (mgd->super_version->Unref()) { + mgd->super_version->Cleanup(); + superversions_to_delete.push_back(mgd->super_version); } -#endif // ROCKSDB_LITE } - return s; -} + mutex_.Unlock(); -void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, - FileMetaData* file_meta, - const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop) { -#ifndef ROCKSDB_LITE - if (immutable_db_options_.listeners.size() == 0U) { - return; - } - mutex_.AssertHeld(); - if (shutting_down_.load(std::memory_order_acquire)) { - return; + for (auto td : superversions_to_delete) { + delete td; } - bool triggered_writes_slowdown = - (cfd->current()->storage_info()->NumLevelFiles(0) >= - mutable_cf_options.level0_slowdown_writes_trigger); - bool triggered_writes_stop = - (cfd->current()->storage_info()->NumLevelFiles(0) >= - mutable_cf_options.level0_stop_writes_trigger); - // release lock while notifying events - mutex_.Unlock(); - { - FlushJobInfo info; - info.cf_name = cfd->GetName(); - // TODO(yhchiang): make db_paths dynamic in case flush does not - // go to L0 in the future. - info.file_path = MakeTableFileName(immutable_db_options_.db_paths[0].path, - file_meta->fd.GetNumber()); - info.thread_id = env_->GetThreadID(); - info.job_id = job_id; - info.triggered_writes_slowdown = triggered_writes_slowdown; - info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->smallest_seqno; - info.largest_seqno = file_meta->largest_seqno; - info.table_properties = prop; - for (auto listener : immutable_db_options_.listeners) { - listener->OnFlushCompleted(this, info); - } + for (auto mgd : multiget_cf_data) { + delete mgd.second; } - mutex_.Lock(); - // no need to signal bg_cv_ as it will be signaled at the end of the - // flush process. 
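The MultiGet assembled across this hunk takes one snapshot and one SuperVersion reference per column family, then runs the single-key lookup per key. The caller's view, as a sketch (assumes an open rocksdb::DB* db; both keys read from the default column family):

#include <string>
#include <vector>
#include "rocksdb/db.h"

// One status per key; `values` is resized by MultiGet itself.
void MultiGetSketch(rocksdb::DB* db) {
  std::vector<rocksdb::ColumnFamilyHandle*> cfs(2, db->DefaultColumnFamily());
  std::vector<rocksdb::Slice> keys = {"k1", "k2"};
  std::vector<std::string> values;
  std::vector<rocksdb::Status> statuses =
      db->MultiGet(rocksdb::ReadOptions(), cfs, keys, &values);
  for (size_t i = 0; i < statuses.size(); ++i) {
    if (statuses[i].IsNotFound()) {
      // keys[i] was absent in the snapshot MultiGet took.
    }
  }
}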
-#endif // ROCKSDB_LITE -} -Status DBImpl::CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { - if (options.target_path_id >= immutable_db_options_.db_paths.size()) { - return Status::InvalidArgument("Invalid target path ID"); - } + RecordTick(stats_, NUMBER_MULTIGET_CALLS); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); + MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read); + PERF_TIMER_STOP(get_post_process_time); - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); - bool exclusive = options.exclusive_manual_compaction; + return stat_list; +} + +Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + Status s; + Status persist_options_status; + *handle = nullptr; - Status s = FlushMemTable(cfd, FlushOptions()); + s = CheckCompressionSupported(cf_options); + if (s.ok() && immutable_db_options_.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cf_options); + } if (!s.ok()) { - LogFlush(immutable_db_options_.info_log); return s; } - int max_level_with_files = 0; { InstrumentedMutexLock l(&mutex_); - Version* base = cfd->current(); - for (int level = 1; level < base->storage_info()->num_non_empty_levels(); - level++) { - if (base->storage_info()->OverlapInLevel(level, begin, end)) { - max_level_with_files = level; - } + + if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != + nullptr) { + return Status::InvalidArgument("Column family already exists"); } - } + VersionEdit edit; + edit.AddColumnFamily(column_family_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + edit.SetColumnFamily(new_id); + edit.SetLogNumber(logfile_number_); + edit.SetComparatorName(cf_options.comparator->Name()); - int final_output_level = 0; - if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal && - cfd->NumberLevels() > 1) { - // Always compact all files together. - s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, - cfd->NumberLevels() - 1, options.target_path_id, - begin, end, exclusive); - final_output_level = cfd->NumberLevels() - 1; - } else { - for (int level = 0; level <= max_level_with_files; level++) { - int output_level; - // in case the compaction is universal or if we're compacting the - // bottom-most level, the output level will be the same as input one. - // level 0 can never be the bottommost level (i.e. 
if all files are in - // level 0, we will compact to level 1) - if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - output_level = level; - } else if (level == max_level_with_files && level > 0) { - if (options.bottommost_level_compaction == - BottommostLevelCompaction::kSkip) { - // Skip bottommost level compaction - continue; - } else if (options.bottommost_level_compaction == - BottommostLevelCompaction::kIfHaveCompactionFilter && - cfd->ioptions()->compaction_filter == nullptr && - cfd->ioptions()->compaction_filter_factory == nullptr) { - // Skip bottommost level compaction since we don't have a compaction - // filter - continue; - } - output_level = level; - } else { - output_level = level + 1; - if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && - cfd->ioptions()->level_compaction_dynamic_level_bytes && - level == 0) { - output_level = ColumnFamilyData::kCompactToBaseLevel; - } - } - s = RunManualCompaction(cfd, level, output_level, options.target_path_id, - begin, end, exclusive); - if (!s.ok()) { - break; - } - if (output_level == ColumnFamilyData::kCompactToBaseLevel) { - final_output_level = cfd->NumberLevels() - 1; - } else if (output_level > final_output_level) { - final_output_level = output_level; + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + { // write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit, + &mutex_, directories_.GetDbDir(), false, + &cf_options); + + if (s.ok()) { + // If the column family was created successfully, we then persist + // the updated RocksDB options under the same single write thread + persist_options_status = WriteOptionsFile(); } - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); + write_thread_.ExitUnbatched(&w); } - } - if (!s.ok()) { - LogFlush(immutable_db_options_.info_log); - return s; - } - - if (options.change_level) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[RefitLevel] waiting for background threads to stop"); - s = PauseBackgroundWork(); if (s.ok()) { - s = ReFitLevel(cfd, final_output_level, options.target_level); + single_column_family_mode_ = false; + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + delete InstallSuperVersionAndScheduleWork( + cfd, nullptr, *cfd->GetLatestMutableCFOptions()); + + if (!cfd->mem()->IsSnapshotSupported()) { + is_snapshot_supported_ = false; + } + + *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Created column family [%s] (ID %u)", + column_family_name.c_str(), (unsigned)cfd->GetID()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Creating column family [%s] FAILED -- %s", + column_family_name.c_str(), s.ToString().c_str()); } - ContinueBackgroundWork(); - } - LogFlush(immutable_db_options_.info_log); + } // InstrumentedMutexLock l(&mutex_) - { - InstrumentedMutexLock l(&mutex_); - // an automatic compaction that has been scheduled might have been - // preempted by the manual compactions. Need to schedule it back. 
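The manual-compaction driver being removed here (presumably landing in db_impl_compaction_flush.cc) is reached through DB::CompactRange. A usage sketch with the two options the code above consults; null bounds compact the whole key space:

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status FullManualCompaction(rocksdb::DB* db) {
  rocksdb::CompactRangeOptions cro;
  cro.exclusive_manual_compaction = true;  // pause automatic compactions
  cro.bottommost_level_compaction =
      rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter;
  // nullptr bounds = compact from the first key to the last.
  return db->CompactRange(cro, nullptr, nullptr);
}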
-    MaybeScheduleFlushOrCompaction();
+  // this is outside the mutex
+  if (s.ok()) {
+    NewThreadStatusCfInfo(
+        reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
+    if (!persist_options_status.ok()) {
+      if (immutable_db_options_.fail_if_options_file_error) {
+        s = Status::IOError(
+            "ColumnFamily has been created, but unable to persist "
+            "options in CreateColumnFamily()",
+            persist_options_status.ToString().c_str());
+      }
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Unable to persist options in CreateColumnFamily() -- %s",
+                     persist_options_status.ToString().c_str());
+    }
  }
-  return s;
}
-Status DBImpl::CompactFiles(
-    const CompactionOptions& compact_options,
-    ColumnFamilyHandle* column_family,
-    const std::vector<std::string>& input_file_names,
-    const int output_level, const int output_path_id) {
-#ifdef ROCKSDB_LITE
-  // not supported in lite version
-  return Status::NotSupported("Not supported in ROCKSDB LITE");
-#else
-  if (column_family == nullptr) {
-    return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  if (cfd->GetID() == 0) {
+    return Status::InvalidArgument("Can't drop default column family");
  }
-  auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
-  assert(cfd);
+  bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
-  Status s;
-  JobContext job_context(0, true);
-  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
-                       immutable_db_options_.info_log.get());
+  VersionEdit edit;
+  edit.DropColumnFamily();
+  edit.SetColumnFamily(cfd->GetID());
-  // Perform CompactFiles
-  SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
+  Status s;
+  Status options_persist_status;
  {
    InstrumentedMutexLock l(&mutex_);
-
-    // This call will unlock/lock the mutex to wait for current running
-    // IngestExternalFile() calls to finish.
-    WaitForIngestFile();
-
-    s = CompactFilesImpl(compact_options, cfd, sv->current,
-                         input_file_names, output_level,
-                         output_path_id, &job_context, &log_buffer);
-  }
-  if (sv->Unref()) {
-    mutex_.Lock();
-    sv->Cleanup();
-    mutex_.Unlock();
-    delete sv;
-  }
-
-  // Find and delete obsolete files
-  {
-    InstrumentedMutexLock l(&mutex_);
-    // If !s.ok(), this means that Compaction failed. In that case, we want
-    // to delete all obsolete files we might have created and we force
-    // FindObsoleteFiles(). This is because job_context does not
-    // catch all created files if compaction failed.
-    FindObsoleteFiles(&job_context, !s.ok());
-  }  // release the mutex
-
-  // delete unnecessary files if any, this is done outside the mutex
-  if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
-    // Have to flush the info logs before bg_compaction_scheduled_--
-    // because if bg_flush_scheduled_ becomes 0 and the lock is
-    // released, the deconstructor of DB can kick in and destroy all the
-    // states of DB so info_log might not be available after that point.
-    // It also applies to access other states that DB owns.
-    log_buffer.FlushBufferToLog();
-    if (job_context.HaveSomethingToDelete()) {
-      // no mutex is locked here. No need to Unlock() and Lock() here.
- PurgeObsoleteFiles(job_context); - } - job_context.Clean(); - } - - return s; -#endif // ROCKSDB_LITE -} - -#ifndef ROCKSDB_LITE -Status DBImpl::CompactFilesImpl( - const CompactionOptions& compact_options, ColumnFamilyData* cfd, - Version* version, const std::vector& input_file_names, - const int output_level, int output_path_id, JobContext* job_context, - LogBuffer* log_buffer) { - mutex_.AssertHeld(); - - if (shutting_down_.load(std::memory_order_acquire)) { - return Status::ShutdownInProgress(); - } - - std::unordered_set input_set; - for (auto file_name : input_file_names) { - input_set.insert(TableFileNameToNumber(file_name)); - } - - ColumnFamilyMetaData cf_meta; - // TODO(yhchiang): can directly use version here if none of the - // following functions call is pluggable to external developers. - version->GetColumnFamilyMetaData(&cf_meta); - - if (output_path_id < 0) { - if (immutable_db_options_.db_paths.size() == 1U) { - output_path_id = 0; - } else { - return Status::NotSupported( - "Automatic output path selection is not " - "yet supported in CompactFiles()"); - } - } - - Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( - &input_set, cf_meta, output_level); - if (!s.ok()) { - return s; - } - - std::vector input_files; - s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( - &input_files, &input_set, version->storage_info(), compact_options); - if (!s.ok()) { - return s; - } - - for (auto inputs : input_files) { - if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) { - return Status::Aborted( - "Some of the necessary compaction input " - "files are already being compacted"); + if (cfd->IsDropped()) { + s = Status::InvalidArgument("Column family already dropped!\n"); } - } - - // At this point, CompactFiles will be run. - bg_compaction_scheduled_++; - - unique_ptr c; - assert(cfd->compaction_picker()); - c.reset(cfd->compaction_picker()->FormCompaction( - compact_options, input_files, output_level, version->storage_info(), - *cfd->GetLatestMutableCFOptions(), output_path_id)); - if (!c) { - return Status::Aborted("Another Level 0 compaction is running"); - } - c->SetInputVersion(version); - // deletion compaction currently not allowed in CompactFiles. - assert(!c->deletion_compaction()); - - SequenceNumber earliest_write_conflict_snapshot; - std::vector snapshot_seqs = - snapshots_.GetAll(&earliest_write_conflict_snapshot); - - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); - - assert(is_snapshot_supported_ || snapshots_.empty()); - CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, env_options_, - versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, &bg_error_, - snapshot_seqs, earliest_write_conflict_snapshot, table_cache_, - &event_logger_, c->mutable_cf_options()->paranoid_file_checks, - c->mutable_cf_options()->report_bg_io_stats, dbname_, - nullptr); // Here we pass a nullptr for CompactionJobStats because - // CompactFiles does not trigger OnCompactionCompleted(), - // which is the only place where CompactionJobStats is - // returned. The idea of not triggering OnCompationCompleted() - // is that CompactFiles runs in the caller thread, so the user - // should always know when it completes. As a result, it makes - // less sense to notify the users something they should already - // know. 
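CompactFilesImpl and its wrapper move out of this file as well; from the public API the entry point remains DB::CompactFiles, which takes SST file names as reported in ColumnFamilyMetaData. A sketch compacting all current L0 files of the default column family (the output level choice is illustrative):

#include <string>
#include <vector>
#include "rocksdb/db.h"

rocksdb::Status CompactCurrentL0(rocksdb::DB* db, int output_level) {
  rocksdb::ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(&meta);  // default column family
  std::vector<std::string> inputs;
  for (const auto& f : meta.levels[0].files) {
    inputs.push_back(f.name);  // names look like "/000123.sst"
  }
  // Fails with Status::Aborted if any input is already being compacted,
  // mirroring the FilesInCompaction() check above.
  return db->CompactFiles(rocksdb::CompactionOptions(), inputs, output_level);
}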
- // - // In the future, if we would like to add CompactionJobStats - // support for CompactFiles, we should have CompactFiles API - // pass a pointer of CompactionJobStats as the out-value - // instead of using EventListener. - - // Creating a compaction influences the compaction score because the score - // takes running compactions into account (by skipping files that are already - // being compacted). Since we just changed compaction score, we recalculate it - // here. - version->storage_info()->ComputeCompactionScore(*cfd->ioptions(), - *c->mutable_cf_options()); - - compaction_job.Prepare(); - - mutex_.Unlock(); - TEST_SYNC_POINT("CompactFilesImpl:0"); - TEST_SYNC_POINT("CompactFilesImpl:1"); - compaction_job.Run(); - TEST_SYNC_POINT("CompactFilesImpl:2"); - TEST_SYNC_POINT("CompactFilesImpl:3"); - mutex_.Lock(); - - Status status = compaction_job.Install(*c->mutable_cf_options()); - if (status.ok()) { - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); - } - c->ReleaseCompactionFiles(s); - - ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); - - if (status.ok()) { - // Done - } else if (status.IsShutdownInProgress()) { - // Ignore compaction errors found during shutting down - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "[%s] [JOB %d] Compaction error: %s", - c->column_family_data()->GetName().c_str(), - job_context->job_id, status.ToString().c_str()); - if (immutable_db_options_.paranoid_checks && bg_error_.ok()) { - bg_error_ = status; + if (s.ok()) { + // we drop column family from a single write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_); + if (s.ok()) { + // If the column family was dropped successfully, we then persist + // the updated RocksDB options under the same single write thread + options_persist_status = WriteOptionsFile(); + } + write_thread_.ExitUnbatched(&w); } - } - - c.reset(); - - bg_compaction_scheduled_--; - if (bg_compaction_scheduled_ == 0) { - bg_cv_.SignalAll(); - } - - return status; -} -#endif // ROCKSDB_LITE - -Status DBImpl::PauseBackgroundWork() { - InstrumentedMutexLock guard_lock(&mutex_); - bg_compaction_paused_++; - while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) { - bg_cv_.Wait(); - } - bg_work_paused_++; - return Status::OK(); -} - -Status DBImpl::ContinueBackgroundWork() { - InstrumentedMutexLock guard_lock(&mutex_); - if (bg_work_paused_ == 0) { - return Status::InvalidArgument(); - } - assert(bg_work_paused_ > 0); - assert(bg_compaction_paused_ > 0); - bg_compaction_paused_--; - bg_work_paused_--; - // It's sufficient to check just bg_work_paused_ here since - // bg_work_paused_ is always no greater than bg_compaction_paused_ - if (bg_work_paused_ == 0) { - MaybeScheduleFlushOrCompaction(); - } - return Status::OK(); -} -void DBImpl::NotifyOnCompactionCompleted( - ColumnFamilyData* cfd, Compaction *c, const Status &st, - const CompactionJobStats& compaction_job_stats, - const int job_id) { -#ifndef ROCKSDB_LITE - if (immutable_db_options_.listeners.size() == 0U) { - return; - } - mutex_.AssertHeld(); - if (shutting_down_.load(std::memory_order_acquire)) { - return; - } - // release lock while notifying events - mutex_.Unlock(); - TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); - { - CompactionJobInfo info; - info.cf_name = cfd->GetName(); - info.status = st; - info.thread_id = env_->GetThreadID(); 
- info.job_id = job_id; - info.base_input_level = c->start_level(); - info.output_level = c->output_level(); - info.stats = compaction_job_stats; - info.table_properties = c->GetOutputTableProperties(); - info.compaction_reason = c->compaction_reason(); - info.compression = c->output_compression(); - for (size_t i = 0; i < c->num_input_levels(); ++i) { - for (const auto fmd : *c->inputs(i)) { - auto fn = TableFileName(immutable_db_options_.db_paths, - fmd->fd.GetNumber(), fmd->fd.GetPathId()); - info.input_files.push_back(fn); - if (info.table_properties.count(fn) == 0) { - std::shared_ptr tp; - auto s = cfd->current()->GetTableProperties(&tp, fmd, &fn); - if (s.ok()) { - info.table_properties[fn] = tp; - } + if (!cf_support_snapshot) { + // Dropped Column Family doesn't support snapshot. Need to recalculate + // is_snapshot_supported_. + bool new_is_snapshot_supported = true; + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) { + new_is_snapshot_supported = false; + break; } } - } - for (const auto newf : c->edit()->GetNewFiles()) { - info.output_files.push_back(TableFileName(immutable_db_options_.db_paths, - newf.second.fd.GetNumber(), - newf.second.fd.GetPathId())); - } - for (auto listener : immutable_db_options_.listeners) { - listener->OnCompactionCompleted(this, info); - } - } - mutex_.Lock(); - // no need to signal bg_cv_ as it will be signaled at the end of the - // flush process. -#endif // ROCKSDB_LITE -} - -Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, - const std::unordered_map& options_map) { -#ifdef ROCKSDB_LITE - return Status::NotSupported("Not supported in ROCKSDB LITE"); -#else - auto* cfd = reinterpret_cast(column_family)->cfd(); - if (options_map.empty()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "SetOptions() on column family [%s], empty input", - cfd->GetName().c_str()); - return Status::InvalidArgument("empty input"); - } - - MutableCFOptions new_options; - Status s; - Status persist_options_status; - WriteThread::Writer w; - { - InstrumentedMutexLock l(&mutex_); - s = cfd->SetOptions(options_map); - if (s.ok()) { - new_options = *cfd->GetLatestMutableCFOptions(); - // Append new version to recompute compaction score. - VersionEdit dummy_edit; - versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); - // Trigger possible flush/compactions. This has to be before we persist - // options to file, otherwise there will be a deadlock with writer - // thread. - auto* old_sv = - InstallSuperVersionAndScheduleWork(cfd, nullptr, new_options); - delete old_sv; - - write_thread_.EnterUnbatched(&w, &mutex_); - persist_options_status = WriteOptionsFile(); - write_thread_.ExitUnbatched(&w); + is_snapshot_supported_ = new_is_snapshot_supported; } } - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "SetOptions() on column family [%s], inputs:", - cfd->GetName().c_str()); - for (const auto& o : options_map) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), - o.second.c_str()); - } if (s.ok()) { + // Note that here we erase the associated cf_info of the to-be-dropped + // cfd before its ref-count goes to zero to avoid having to erase cf_info + // later inside db_mutex. 
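CreateColumnFamily and DropColumnFamily above both funnel their MANIFEST edit through a single unbatched writer and then persist the options file under that same writer. From the caller's side the lifecycle looks like this (a sketch; the "scratch" name is arbitrary):

#include "rocksdb/db.h"

rocksdb::Status ColumnFamilyRoundTrip(rocksdb::DB* db) {
  rocksdb::ColumnFamilyHandle* cf = nullptr;
  rocksdb::Status s = db->CreateColumnFamily(rocksdb::ColumnFamilyOptions(),
                                             "scratch", &cf);
  if (!s.ok()) return s;
  // DropColumnFamily only logs the drop; readers holding the handle keep
  // working until the handle itself is released.
  s = db->DropColumnFamily(cf);
  delete cf;  // release the handle regardless of the drop status
  return s;
}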
+    EraseThreadStatusCfInfo(cfd);
+    assert(cfd->IsDropped());
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+                                  mutable_cf_options->max_write_buffer_number;
    ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                  "[%s] SetOptions() succeeded", cfd->GetName().c_str());
-    new_options.Dump(immutable_db_options_.info_log.get());
-    if (!persist_options_status.ok()) {
+                  "Dropped column family with id %u\n", cfd->GetID());
+
+    if (!options_persist_status.ok()) {
      if (immutable_db_options_.fail_if_options_file_error) {
        s = Status::IOError(
-            "SetOptions() succeeded, but unable to persist options",
-            persist_options_status.ToString());
+            "ColumnFamily has been dropped, but unable to persist "
+            "options in DropColumnFamily()",
+            options_persist_status.ToString().c_str());
      }
      ROCKS_LOG_WARN(immutable_db_options_.info_log,
-                    "Unable to persist options in SetOptions() -- %s",
-                    persist_options_status.ToString().c_str());
+                    "Unable to persist options in DropColumnFamily() -- %s",
+                    options_persist_status.ToString().c_str());
    }
  } else {
-    ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
-                   cfd->GetName().c_str());
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Dropping column family with id %u FAILED -- %s\n",
+                    cfd->GetID(), s.ToString().c_str());
  }
-  LogFlush(immutable_db_options_.info_log);
+
  return s;
-#endif  // ROCKSDB_LITE
}
-Status DBImpl::SetDBOptions(
-    const std::unordered_map<std::string, std::string>& options_map) {
-#ifdef ROCKSDB_LITE
-  return Status::NotSupported("Not supported in ROCKSDB LITE");
-#else
-  if (options_map.empty()) {
-    ROCKS_LOG_WARN(immutable_db_options_.info_log,
-                   "SetDBOptions(), empty input.");
-    return Status::InvalidArgument("empty input");
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         std::string* value, bool* value_found) {
+  assert(value != nullptr);
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
  }
+  ReadOptions roptions = read_options;
+  roptions.read_tier = kBlockCacheTier;  // read from block cache only
+  PinnableSlice pinnable_val;
+  auto s = GetImpl(roptions, column_family, key, &pinnable_val, value_found);
+  value->assign(pinnable_val.data(), pinnable_val.size());
-  MutableDBOptions new_options;
-  Status s;
-  Status persist_options_status;
-  WriteThread::Writer w;
-  WriteContext write_context;
-  {
-    InstrumentedMutexLock l(&mutex_);
-    s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
-                                       &new_options);
-    if (s.ok()) {
-      if (new_options.max_background_compactions >
-          mutable_db_options_.max_background_compactions) {
-        env_->IncBackgroundThreadsIfNeeded(
-            new_options.max_background_compactions, Env::Priority::LOW);
-        MaybeScheduleFlushOrCompaction();
-      }
-
-      write_controller_.set_max_delayed_write_rate(new_options.delayed_write_rate);
-
-      mutable_db_options_ = new_options;
+  // If block_cache is enabled and the index block of the table is
+  // not present in block_cache, the return value will be Status::Incomplete.
+  // In this case, key may still exist in the table.
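KeyMayExist above is GetImpl with read_tier pinned to kBlockCacheTier, which is both what makes the probe cheap and what produces the Incomplete case the comment describes. A caller-side sketch (the helper name is hypothetical):

#include <string>
#include "rocksdb/db.h"

// A false return is definitive (the key was absent as of the read). A true
// return with value_found == false means "maybe": the value was in neither
// memtable nor block cache, so only an I/O-bound Get can confirm.
bool ProbablyExists(rocksdb::DB* db, const rocksdb::Slice& key) {
  std::string value;
  bool value_found = false;
  return db->KeyMayExist(rocksdb::ReadOptions(), key, &value, &value_found);
}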
+ return s.ok() || s.IsIncomplete(); +} - write_thread_.EnterUnbatched(&w, &mutex_); - if (total_log_size_ > GetMaxTotalWalSize()) { - Status purge_wal_status = HandleWALFull(&write_context); - if (!purge_wal_status.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Unable to purge WAL files in SetDBOptions() -- %s", - purge_wal_status.ToString().c_str()); - } - } - persist_options_status = WriteOptionsFile(); - write_thread_.ExitUnbatched(&w); - } - } - ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:"); - for (const auto& o : options_map) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), - o.second.c_str()); +Iterator* DBImpl::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); } - if (s.ok()) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded"); - new_options.Dump(immutable_db_options_.info_log.get()); - if (!persist_options_status.ok()) { - if (immutable_db_options_.fail_if_options_file_error) { - s = Status::IOError( - "SetDBOptions() succeeded, but unable to persist options", - persist_options_status.ToString()); - } - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Unable to persist options in SetDBOptions() -- %s", - persist_options_status.ToString().c_str()); - } - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed"); - } - LogFlush(immutable_db_options_.info_log); - return s; -#endif // ROCKSDB_LITE -} - -// return the same level if it cannot be moved -int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, int level) { - mutex_.AssertHeld(); - const auto* vstorage = cfd->current()->storage_info(); - int minimum_level = level; - for (int i = level - 1; i > 0; --i) { - // stop if level i is not empty - if (vstorage->NumLevelFiles(i) > 0) break; - // stop if level i is too small (cannot fit the level files) - if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) { - break; - } - - minimum_level = i; - } - return minimum_level; -} - -// REQUIREMENT: block all background work by calling PauseBackgroundWork() -// before calling this function -Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { - assert(level < cfd->NumberLevels()); - if (target_level >= cfd->NumberLevels()) { - return Status::InvalidArgument("Target level exceeds number of levels"); - } - - std::unique_ptr superversion_to_free; - std::unique_ptr new_superversion(new SuperVersion()); - - Status status; - - InstrumentedMutexLock guard_lock(&mutex_); - - // only allow one thread refitting - if (refitting_level_) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[ReFitLevel] another thread is refitting"); - return Status::NotSupported("another thread is refitting"); - } - refitting_level_ = true; - - const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - // move to a smaller level - int to_level = target_level; - if (target_level < 0) { - to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); - } - - auto* vstorage = cfd->current()->storage_info(); - if (to_level > level) { - if (level == 0) { - return Status::NotSupported( - "Cannot change from level 0 to other levels."); - } - // Check levels are empty for a trivial move - for (int l = level + 1; l <= to_level; l++) { - 
if (vstorage->NumLevelFiles(l) > 0) { - return Status::NotSupported( - "Levels between source and target are not empty for a move."); - } - } - } - if (to_level != level) { - ROCKS_LOG_DEBUG(immutable_db_options_.info_log, - "[%s] Before refitting:\n%s", cfd->GetName().c_str(), - cfd->current()->DebugString().data()); - - VersionEdit edit; - edit.SetColumnFamily(cfd->GetID()); - for (const auto& f : vstorage->LevelFiles(level)) { - edit.DeleteFile(level, f->fd.GetNumber()); - edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, - f->marked_for_compaction); - } - ROCKS_LOG_DEBUG(immutable_db_options_.info_log, - "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), - edit.DebugString().data()); - - status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, - directories_.GetDbDir()); - superversion_to_free.reset(InstallSuperVersionAndScheduleWork( - cfd, new_superversion.release(), mutable_cf_options)); - - ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", - cfd->GetName().c_str(), status.ToString().data()); - - if (status.ok()) { - ROCKS_LOG_DEBUG(immutable_db_options_.info_log, - "[%s] After refitting:\n%s", cfd->GetName().c_str(), - cfd->current()->DebugString().data()); - } - } - - refitting_level_ = false; - - return status; -} - -int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); - return cfh->cfd()->NumberLevels(); -} - -int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { - return 0; -} - -int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); - InstrumentedMutexLock l(&mutex_); - return cfh->cfd()->GetSuperVersion()-> - mutable_cf_options.level0_stop_writes_trigger; -} - -Status DBImpl::Flush(const FlushOptions& flush_options, - ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); - return FlushMemTable(cfh->cfd(), flush_options); -} - -Status DBImpl::SyncWAL() { - autovector logs_to_sync; - bool need_log_dir_sync; - uint64_t current_log_number; - - { - InstrumentedMutexLock l(&mutex_); - assert(!logs_.empty()); - - // This SyncWAL() call only cares about logs up to this number. - current_log_number = logfile_number_; - - while (logs_.front().number <= current_log_number && - logs_.front().getting_synced) { - log_sync_cv_.Wait(); - } - // First check that logs are safe to sync in background. - for (auto it = logs_.begin(); - it != logs_.end() && it->number <= current_log_number; ++it) { - if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) { - return Status::NotSupported( - "SyncWAL() is not supported for this implementation of WAL file", - immutable_db_options_.allow_mmap_writes - ? 
"try setting Options::allow_mmap_writes to false" - : Slice()); - } - } - for (auto it = logs_.begin(); - it != logs_.end() && it->number <= current_log_number; ++it) { - auto& log = *it; - assert(!log.getting_synced); - log.getting_synced = true; - logs_to_sync.push_back(log.writer); - } - - need_log_dir_sync = !log_dir_synced_; - } - - RecordTick(stats_, WAL_FILE_SYNCED); - Status status; - for (log::Writer* log : logs_to_sync) { - status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); - if (!status.ok()) { - break; - } - } - if (status.ok() && need_log_dir_sync) { - status = directories_.GetWalDir()->Fsync(); - } - - TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); - { - InstrumentedMutexLock l(&mutex_); - MarkLogsSynced(current_log_number, need_log_dir_sync, status); - } - TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); - - return status; -} - -void DBImpl::MarkLogsSynced( - uint64_t up_to, bool synced_dir, const Status& status) { - mutex_.AssertHeld(); - if (synced_dir && - logfile_number_ == up_to && - status.ok()) { - log_dir_synced_ = true; - } - for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { - auto& log = *it; - assert(log.getting_synced); - if (status.ok() && logs_.size() > 1) { - logs_to_free_.push_back(log.ReleaseWriter()); - it = logs_.erase(it); - } else { - log.getting_synced = false; - ++it; - } - } - assert(!status.ok() || logs_.empty() || logs_[0].number > up_to || - (logs_.size() == 1 && !logs_[0].getting_synced)); - log_sync_cv_.SignalAll(); -} - -SequenceNumber DBImpl::GetLatestSequenceNumber() const { - return versions_->LastSequence(); -} - -Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, - int output_level, uint32_t output_path_id, - const Slice* begin, const Slice* end, - bool exclusive, bool disallow_trivial_move) { - assert(input_level == ColumnFamilyData::kCompactAllLevels || - input_level >= 0); - - InternalKey begin_storage, end_storage; - CompactionArg* ca; - - bool scheduled = false; - bool manual_conflict = false; - ManualCompaction manual; - manual.cfd = cfd; - manual.input_level = input_level; - manual.output_level = output_level; - manual.output_path_id = output_path_id; - manual.done = false; - manual.in_progress = false; - manual.incomplete = false; - manual.exclusive = exclusive; - manual.disallow_trivial_move = disallow_trivial_move; - // For universal compaction, we enforce every manual compaction to compact - // all files. - if (begin == nullptr || - cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - manual.begin = nullptr; - } else { - begin_storage.SetMaxPossibleForUserKey(*begin); - manual.begin = &begin_storage; - } - if (end == nullptr || - cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - manual.end = nullptr; - } else { - end_storage.SetMinPossibleForUserKey(*end); - manual.end = &end_storage; - } - - TEST_SYNC_POINT("DBImpl::RunManualCompaction:0"); - TEST_SYNC_POINT("DBImpl::RunManualCompaction:1"); - InstrumentedMutexLock l(&mutex_); - - // When a manual compaction arrives, temporarily disable scheduling of - // non-manual compactions and wait until the number of scheduled compaction - // jobs drops to zero. This is needed to ensure that this manual compaction - // can compact any range of keys/files. 
- // - // HasPendingManualCompaction() is true when at least one thread is inside - // RunManualCompaction(), i.e. during that time no other compaction will - // get scheduled (see MaybeScheduleFlushOrCompaction). - // - // Note that the following loop doesn't stop more that one thread calling - // RunManualCompaction() from getting to the second while loop below. - // However, only one of them will actually schedule compaction, while - // others will wait on a condition variable until it completes. - - AddManualCompaction(&manual); - TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); - if (exclusive) { - while (bg_compaction_scheduled_ > 0) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "[%s] Manual compaction waiting for all other scheduled background " - "compactions to finish", - cfd->GetName().c_str()); - bg_cv_.Wait(); - } - } - - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[%s] Manual compaction starting", cfd->GetName().c_str()); - - // We don't check bg_error_ here, because if we get the error in compaction, - // the compaction will set manual.status to bg_error_ and set manual.done to - // true. - while (!manual.done) { - assert(HasPendingManualCompaction()); - manual_conflict = false; - if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || - scheduled || - ((manual.manual_end = &manual.tmp_storage1)&&( - (manual.compaction = manual.cfd->CompactRange( - *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, - manual.output_level, manual.output_path_id, manual.begin, - manual.end, &manual.manual_end, &manual_conflict)) == - nullptr) && - manual_conflict)) { - // exclusive manual compactions should not see a conflict during - // CompactRange - assert(!exclusive || !manual_conflict); - // Running either this or some other manual compaction - bg_cv_.Wait(); - if (scheduled && manual.incomplete == true) { - assert(!manual.in_progress); - scheduled = false; - manual.incomplete = false; - } - } else if (!scheduled) { - if (manual.compaction == nullptr) { - manual.done = true; - bg_cv_.SignalAll(); - continue; - } - ca = new CompactionArg; - ca->db = this; - ca->m = &manual; - manual.incomplete = false; - bg_compaction_scheduled_++; - env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, - &DBImpl::UnscheduleCallback); - scheduled = true; - } - } - - assert(!manual.in_progress); - assert(HasPendingManualCompaction()); - RemoveManualCompaction(&manual); - bg_cv_.SignalAll(); - return manual.status; -} - -InternalIterator* DBImpl::NewInternalIterator( - Arena* arena, RangeDelAggregator* range_del_agg, - ColumnFamilyHandle* column_family) { - ColumnFamilyData* cfd; - if (column_family == nullptr) { - cfd = default_cf_handle_->cfd(); - } else { - auto cfh = reinterpret_cast(column_family); - cfd = cfh->cfd(); - } - - mutex_.Lock(); - SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); - mutex_.Unlock(); - ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version, arena, - range_del_agg); -} - -Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, - const FlushOptions& flush_options, - bool writes_stopped) { - Status s; - { - WriteContext context; - InstrumentedMutexLock guard_lock(&mutex_); - - if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty()) { - // Nothing to flush - return Status::OK(); - } - - WriteThread::Writer w; - if (!writes_stopped) { - write_thread_.EnterUnbatched(&w, &mutex_); - } - - // SwitchMemtable() will release and reacquire mutex - // during 
execution - s = SwitchMemtable(cfd, &context); - - if (!writes_stopped) { - write_thread_.ExitUnbatched(&w); - } - - cfd->imm()->FlushRequested(); - - // schedule flush - SchedulePendingFlush(cfd); - MaybeScheduleFlushOrCompaction(); - } - - if (s.ok() && flush_options.wait) { - // Wait until the compaction completes - s = WaitForFlushMemTable(cfd); - } - return s; -} - -Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { - Status s; - // Wait until the compaction completes - InstrumentedMutexLock l(&mutex_); - while (cfd->imm()->NumNotFlushed() > 0 && bg_error_.ok()) { - if (shutting_down_.load(std::memory_order_acquire)) { - return Status::ShutdownInProgress(); - } - if (cfd->IsDropped()) { - // FlushJob cannot flush a dropped CF, if we did not break here - // we will loop forever since cfd->imm()->NumNotFlushed() will never - // drop to zero - return Status::InvalidArgument("Cannot flush a dropped CF"); - } - bg_cv_.Wait(); - } - if (!bg_error_.ok()) { - s = bg_error_; - } - return s; -} - -Status DBImpl::EnableAutoCompaction( - const std::vector& column_family_handles) { - Status s; - for (auto cf_ptr : column_family_handles) { - Status status = - this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}); - if (!status.ok()) { - s = status; - } - } - - return s; -} - -void DBImpl::MaybeScheduleFlushOrCompaction() { - mutex_.AssertHeld(); - if (!opened_successfully_) { - // Compaction may introduce data race to DB open - return; - } - if (bg_work_paused_ > 0) { - // we paused the background work - return; - } else if (shutting_down_.load(std::memory_order_acquire)) { - // DB is being deleted; no more background compactions - return; - } - - while (unscheduled_flushes_ > 0 && - bg_flush_scheduled_ < immutable_db_options_.max_background_flushes) { - unscheduled_flushes_--; - bg_flush_scheduled_++; - env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); - } - - auto bg_compactions_allowed = BGCompactionsAllowed(); - - // special case -- if max_background_flushes == 0, then schedule flush on a - // compaction thread - if (immutable_db_options_.max_background_flushes == 0) { - while (unscheduled_flushes_ > 0 && - bg_flush_scheduled_ + bg_compaction_scheduled_ < - bg_compactions_allowed) { - unscheduled_flushes_--; - bg_flush_scheduled_++; - env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); - } - } - - if (bg_compaction_paused_ > 0) { - // we paused the background compaction - return; - } - - if (HasExclusiveManualCompaction()) { - // only manual compactions are allowed to run. 
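DB::Flush drives the FlushMemTable/WaitForFlushMemTable pair above: switch the memtable under an unbatched writer, schedule the background job, and, with wait set, block on bg_cv_ until the immutable memtables drain. SyncWAL, removed a little earlier, is the complementary durability call for the log. A caller-side sketch of both (assuming an open rocksdb::DB* db):

#include "rocksdb/db.h"

// Returns only after the resulting L0 file is installed, or on error
// (including the CF being dropped mid-flush, per WaitForFlushMemTable).
rocksdb::Status BlockingFlush(rocksdb::DB* db) {
  rocksdb::FlushOptions fo;
  fo.wait = true;
  return db->Flush(fo, db->DefaultColumnFamily());
}

// Pairs buffered writes with one explicit group sync of the WAL. Requires a
// sync-thread-safe WAL writer (e.g. Options::allow_mmap_writes == false).
rocksdb::Status WriteThenSyncWal(rocksdb::DB* db) {
  rocksdb::WriteOptions wo;
  wo.sync = false;  // fast path; durability deferred to SyncWAL()
  rocksdb::Status s = db->Put(wo, "key", "value");
  return s.ok() ? db->SyncWAL() : s;
}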
don't schedule automatic - // compactions - return; - } - - while (bg_compaction_scheduled_ < bg_compactions_allowed && - unscheduled_compactions_ > 0) { - CompactionArg* ca = new CompactionArg; - ca->db = this; - ca->m = nullptr; - bg_compaction_scheduled_++; - unscheduled_compactions_--; - env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, - &DBImpl::UnscheduleCallback); - } -} - -void DBImpl::SchedulePurge() { - mutex_.AssertHeld(); - assert(opened_successfully_); - - // Purge operations are put into High priority queue - bg_purge_scheduled_++; - env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr); -} - -int DBImpl::BGCompactionsAllowed() const { - mutex_.AssertHeld(); - if (write_controller_.NeedSpeedupCompaction()) { - return mutable_db_options_.max_background_compactions; - } else { - return mutable_db_options_.base_background_compactions; - } -} - -void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { - assert(!cfd->pending_compaction()); - cfd->Ref(); - compaction_queue_.push_back(cfd); - cfd->set_pending_compaction(true); -} - -ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { - assert(!compaction_queue_.empty()); - auto cfd = *compaction_queue_.begin(); - compaction_queue_.pop_front(); - assert(cfd->pending_compaction()); - cfd->set_pending_compaction(false); - return cfd; -} - -void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) { - assert(!cfd->pending_flush()); - cfd->Ref(); - flush_queue_.push_back(cfd); - cfd->set_pending_flush(true); -} - -ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { - assert(!flush_queue_.empty()); - auto cfd = *flush_queue_.begin(); - flush_queue_.pop_front(); - assert(cfd->pending_flush()); - cfd->set_pending_flush(false); - return cfd; -} - -void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) { - if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) { - AddToFlushQueue(cfd); - ++unscheduled_flushes_; - } -} - -void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { - if (!cfd->pending_compaction() && cfd->NeedsCompaction()) { - AddToCompactionQueue(cfd); - ++unscheduled_compactions_; - } -} - -void DBImpl::SchedulePendingPurge(std::string fname, FileType type, - uint64_t number, uint32_t path_id, - int job_id) { - mutex_.AssertHeld(); - PurgeFileInfo file_info(fname, type, number, path_id, job_id); - purge_queue_.push_back(std::move(file_info)); -} - -void DBImpl::BGWorkFlush(void* db) { - IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); - TEST_SYNC_POINT("DBImpl::BGWorkFlush"); - reinterpret_cast(db)->BackgroundCallFlush(); - TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); -} - -void DBImpl::BGWorkCompaction(void* arg) { - CompactionArg ca = *(reinterpret_cast(arg)); - delete reinterpret_cast(arg); - IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); - TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); - reinterpret_cast(ca.db)->BackgroundCallCompaction(ca.m); -} - -void DBImpl::BGWorkPurge(void* db) { - IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); - TEST_SYNC_POINT("DBImpl::BGWorkPurge:start"); - reinterpret_cast(db)->BackgroundCallPurge(); - TEST_SYNC_POINT("DBImpl::BGWorkPurge:end"); -} - -void DBImpl::UnscheduleCallback(void* arg) { - CompactionArg ca = *(reinterpret_cast(arg)); - delete reinterpret_cast(arg); - if ((ca.m != nullptr) && (ca.m->compaction != nullptr)) { - delete ca.m->compaction; - } - TEST_SYNC_POINT("DBImpl::UnscheduleCallback"); -} - -void DBImpl::BackgroundCallPurge() { - mutex_.Lock(); - - // We use one single loop to clear both queues so that after 
existing the loop - // both queues are empty. This is stricter than what is needed, but can make - // it easier for us to reason the correctness. - while (!purge_queue_.empty() || !logs_to_free_queue_.empty()) { - if (!purge_queue_.empty()) { - auto purge_file = purge_queue_.begin(); - auto fname = purge_file->fname; - auto type = purge_file->type; - auto number = purge_file->number; - auto path_id = purge_file->path_id; - auto job_id = purge_file->job_id; - purge_queue_.pop_front(); - - mutex_.Unlock(); - Status file_deletion_status; - DeleteObsoleteFileImpl(file_deletion_status, job_id, fname, type, number, - path_id); - mutex_.Lock(); - } else { - assert(!logs_to_free_queue_.empty()); - log::Writer* log_writer = *(logs_to_free_queue_.begin()); - logs_to_free_queue_.pop_front(); - mutex_.Unlock(); - delete log_writer; - mutex_.Lock(); - } - } - bg_purge_scheduled_--; - - bg_cv_.SignalAll(); - // IMPORTANT:there should be no code after calling SignalAll. This call may - // signal the DB destructor that it's OK to proceed with destruction. In - // that case, all DB variables will be dealloacated and referencing them - // will cause trouble. - mutex_.Unlock(); -} - -Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, - LogBuffer* log_buffer) { - mutex_.AssertHeld(); - - Status status = bg_error_; - if (status.ok() && shutting_down_.load(std::memory_order_acquire)) { - status = Status::ShutdownInProgress(); - } - - if (!status.ok()) { - return status; - } - - ColumnFamilyData* cfd = nullptr; - while (!flush_queue_.empty()) { - // This cfd is already referenced - auto first_cfd = PopFirstFromFlushQueue(); - - if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) { - // can't flush this CF, try next one - if (first_cfd->Unref()) { - delete first_cfd; - } - continue; - } - - // found a flush! - cfd = first_cfd; - break; - } - - if (cfd != nullptr) { - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); - ROCKS_LOG_BUFFER( - log_buffer, - "Calling FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d, compaction slots allowed %d, " - "compaction slots scheduled %d", - cfd->GetName().c_str(), immutable_db_options_.max_background_flushes, - bg_flush_scheduled_, BGCompactionsAllowed() - bg_compaction_scheduled_); - status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, - job_context, log_buffer); - if (cfd->Unref()) { - delete cfd; - } - } - return status; -} - -void DBImpl::BackgroundCallFlush() { - bool made_progress = false; - JobContext job_context(next_job_id_.fetch_add(1), true); - assert(bg_flush_scheduled_); - - TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start"); - - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, - immutable_db_options_.info_log.get()); - { - InstrumentedMutexLock l(&mutex_); - num_running_flushes_++; - - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); - - Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer); - if (!s.ok() && !s.IsShutdownInProgress()) { - // Wait a little bit before retrying background flush in - // case this is an environmental problem and we do not want to - // chew up resources for failed flushes for the duration of - // the problem. 
- uint64_t error_cnt =
- default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
- bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
- mutex_.Unlock();
- ROCKS_LOG_ERROR(immutable_db_options_.info_log,
- "Waiting after background flush error: %s, "
- "Accumulated background error counts: %" PRIu64,
- s.ToString().c_str(), error_cnt);
- log_buffer.FlushBufferToLog();
- LogFlush(immutable_db_options_.info_log);
- env_->SleepForMicroseconds(1000000);
- mutex_.Lock();
- }
-
- ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
-
- // If flush failed, we want to delete all temporary files that we might have
- // created. Thus, we force full scan in FindObsoleteFiles()
- FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
- // delete unnecessary files if any, this is done outside the mutex
- if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
- mutex_.Unlock();
- // Have to flush the info logs before bg_flush_scheduled_--
- // because if bg_flush_scheduled_ becomes 0 and the lock is
- // released, the destructor of DB can kick in and destroy all the
- // states of DB so info_log might not be available after that point.
- // The same applies to accessing other state that the DB owns.
- log_buffer.FlushBufferToLog();
- if (job_context.HaveSomethingToDelete()) {
- PurgeObsoleteFiles(job_context);
- }
- job_context.Clean();
- mutex_.Lock();
- }
-
- assert(num_running_flushes_ > 0);
- num_running_flushes_--;
- bg_flush_scheduled_--;
- // See if there's more work to be done
- MaybeScheduleFlushOrCompaction();
- bg_cv_.SignalAll();
- // IMPORTANT: there should be no code after calling SignalAll. This call may
- // signal the DB destructor that it's OK to proceed with destruction. In
- // that case, all DB variables will be deallocated and referencing them
- // will cause trouble.
- }
-}
-
-void DBImpl::BackgroundCallCompaction(void* arg) {
- bool made_progress = false;
- ManualCompaction* m = reinterpret_cast<ManualCompaction*>(arg);
- JobContext job_context(next_job_id_.fetch_add(1), true);
- TEST_SYNC_POINT("BackgroundCallCompaction:0");
- MaybeDumpStats();
- LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
- immutable_db_options_.info_log.get());
- {
- InstrumentedMutexLock l(&mutex_);
-
- // This call will unlock/lock the mutex to wait for current running
- // IngestExternalFile() calls to finish.
- WaitForIngestFile();
-
- num_running_compactions_++;
-
- auto pending_outputs_inserted_elem =
- CaptureCurrentFileNumberInPendingOutputs();
-
- assert(bg_compaction_scheduled_);
- Status s =
- BackgroundCompaction(&made_progress, &job_context, &log_buffer, m);
- TEST_SYNC_POINT("BackgroundCallCompaction:1");
- if (!s.ok() && !s.IsShutdownInProgress()) {
- // Wait a little bit before retrying background compaction in
- // case this is an environmental problem and we do not want to
- // chew up resources for failed compactions for the duration of
- // the problem.
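BackgroundCallFlush above and BackgroundCallCompaction here apply the same recovery discipline after a non-shutdown error: bump a shared error counter, signal any waiters, sleep one second, and retry. A minimal sketch of that wait-before-retry loop, with illustrative names:

#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

std::atomic<uint64_t> bg_error_count{0};

// Returns true once `work` succeeds; sleeps a fixed second between attempts
// so a persistent environmental failure does not spin the CPU.
template <typename Fn>
bool RunWithFixedBackoff(Fn work, int max_attempts) {
  for (int attempt = 0; attempt < max_attempts; ++attempt) {
    if (work()) {
      return true;
    }
    bg_error_count.fetch_add(1, std::memory_order_relaxed);
    std::this_thread::sleep_for(std::chrono::seconds(1));
  }
  return false;
}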
- uint64_t error_cnt =
- default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
- bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
- mutex_.Unlock();
- log_buffer.FlushBufferToLog();
- ROCKS_LOG_ERROR(immutable_db_options_.info_log,
- "Waiting after background compaction error: %s, "
- "Accumulated background error counts: %" PRIu64,
- s.ToString().c_str(), error_cnt);
- LogFlush(immutable_db_options_.info_log);
- env_->SleepForMicroseconds(1000000);
- mutex_.Lock();
- }
-
- ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
-
- // If compaction failed, we want to delete all temporary files that we might
- // have created (they might not be all recorded in job_context in case of a
- // failure). Thus, we force full scan in FindObsoleteFiles()
- FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
-
- // delete unnecessary files if any, this is done outside the mutex
- if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
- mutex_.Unlock();
- // Have to flush the info logs before bg_compaction_scheduled_--
- // because if bg_compaction_scheduled_ becomes 0 and the lock is
- // released, the destructor of DB can kick in and destroy all the
- // states of DB so info_log might not be available after that point.
- // The same applies to accessing other state that the DB owns.
- log_buffer.FlushBufferToLog();
- if (job_context.HaveSomethingToDelete()) {
- PurgeObsoleteFiles(job_context);
- }
- job_context.Clean();
- mutex_.Lock();
- }
-
- assert(num_running_compactions_ > 0);
- num_running_compactions_--;
- bg_compaction_scheduled_--;
-
- versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
-
- // See if there's more work to be done
- MaybeScheduleFlushOrCompaction();
- if (made_progress || bg_compaction_scheduled_ == 0 ||
- HasPendingManualCompaction()) {
- // signal if
- // * made_progress -- need to wakeup DelayWrite
- // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
- // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
- // If none of these is true, there is no need to signal since nobody is
- // waiting for it
- bg_cv_.SignalAll();
- }
- // IMPORTANT: there should be no code after calling SignalAll. This call may
- // signal the DB destructor that it's OK to proceed with destruction. In
- // that case, all DB variables will be deallocated and referencing them
- // will cause trouble.
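The IMPORTANT comments above are load-bearing: SignalAll may wake ~DBImpl, which is waiting for the background counters to reach zero, so the worker must not touch any member afterwards. A minimal sketch of that shutdown handshake, assuming std::condition_variable in place of the InstrumentedCondVar used here:

#include <condition_variable>
#include <mutex>

struct Host {
  std::mutex mu;
  std::condition_variable cv;
  int scheduled = 1;  // one background job outstanding

  ~Host() {
    std::unique_lock<std::mutex> lock(mu);
    cv.wait(lock, [this] { return scheduled == 0; });
  }

  void BackgroundJob() {
    std::unique_lock<std::mutex> lock(mu);
    // ... do work ...
    --scheduled;
    cv.notify_all();
    // Nothing may follow notify_all() except releasing the lock: the
    // destructor can now complete, after which `this` is gone.
  }
};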
- } -} - -Status DBImpl::BackgroundCompaction(bool* made_progress, - JobContext* job_context, - LogBuffer* log_buffer, void* arg) { - ManualCompaction* manual_compaction = - reinterpret_cast(arg); - *made_progress = false; - mutex_.AssertHeld(); - TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); - - bool is_manual = (manual_compaction != nullptr); - - // (manual_compaction->in_progress == false); - bool trivial_move_disallowed = - is_manual && manual_compaction->disallow_trivial_move; - - CompactionJobStats compaction_job_stats; - Status status = bg_error_; - if (status.ok() && shutting_down_.load(std::memory_order_acquire)) { - status = Status::ShutdownInProgress(); - } - - if (!status.ok()) { - if (is_manual) { - manual_compaction->status = status; - manual_compaction->done = true; - manual_compaction->in_progress = false; - delete manual_compaction->compaction; - manual_compaction = nullptr; - } - return status; - } - - if (is_manual) { - // another thread cannot pick up the same work - manual_compaction->in_progress = true; - } - - unique_ptr c; - // InternalKey manual_end_storage; - // InternalKey* manual_end = &manual_end_storage; - if (is_manual) { - ManualCompaction* m = manual_compaction; - assert(m->in_progress); - c.reset(std::move(m->compaction)); - if (!c) { - m->done = true; - m->manual_end = nullptr; - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Manual compaction from level-%d from %s .. " - "%s; nothing to do\n", - m->cfd->GetName().c_str(), m->input_level, - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)")); - } else { - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] Manual compaction from level-%d to level-%d from %s .. " - "%s; will stop at %s\n", - m->cfd->GetName().c_str(), m->input_level, c->output_level(), - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)"), - ((m->done || m->manual_end == nullptr) - ? "(end)" - : m->manual_end->DebugString().c_str())); - } - } else if (!compaction_queue_.empty()) { - // cfd is referenced here - auto cfd = PopFirstFromCompactionQueue(); - // We unreference here because the following code will take a Ref() on - // this cfd if it is going to use it (Compaction class holds a - // reference). - // This will all happen under a mutex so we don't have to be afraid of - // somebody else deleting it. - if (cfd->Unref()) { - delete cfd; - // This was the last reference of the column family, so no need to - // compact. - return Status::OK(); - } - - if (HaveManualCompaction(cfd)) { - // Can't compact right now, but try again later - TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); - return Status::OK(); - } - - // Pick up latest mutable CF Options and use it throughout the - // compaction job - // Compaction makes a copy of the latest MutableCFOptions. It should be used - // throughout the compaction procedure to make sure consistency. It will - // eventually be installed into SuperVersion - auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); - if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) { - // NOTE: try to avoid unnecessary copy of MutableCFOptions if - // compaction is not necessary. 
Need to make sure mutex is held - // until we make a copy in the following code - TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); - c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); - TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); - if (c != nullptr) { - // update statistics - MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); - // There are three things that can change compaction score: - // 1) When flush or compaction finish. This case is covered by - // InstallSuperVersionAndScheduleWork - // 2) When MutableCFOptions changes. This case is also covered by - // InstallSuperVersionAndScheduleWork, because this is when the new - // options take effect. - // 3) When we Pick a new compaction, we "remove" those files being - // compacted from the calculation, which then influences compaction - // score. Here we check if we need the new compaction even without the - // files that are currently being compacted. If we need another - // compaction, we might be able to execute it in parallel, so we add it - // to the queue and schedule a new thread. - if (cfd->NeedsCompaction()) { - // Yes, we need more compactions! - AddToCompactionQueue(cfd); - ++unscheduled_compactions_; - MaybeScheduleFlushOrCompaction(); - } - } - } - } - - if (!c) { - // Nothing to do - ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); - } else if (c->deletion_compaction()) { - // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old - // file if there is alive snapshot pointing to it - assert(c->num_input_files(1) == 0); - assert(c->level() == 0); - assert(c->column_family_data()->ioptions()->compaction_style == - kCompactionStyleFIFO); - - compaction_job_stats.num_input_files = c->num_input_files(0); - - for (const auto& f : *c->inputs(0)) { - c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); - } - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - &mutex_, directories_.GetDbDir()); - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", - c->column_family_data()->GetName().c_str(), - c->num_input_files(0)); - *made_progress = true; - } else if (!trivial_move_disallowed && c->IsTrivialMove()) { - TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); - // Instrument for event update - // TODO(yhchiang): add op details for showing trivial-move. 
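The deletion_compaction() branch just above is FIFO compaction reduced to its essence: no keys are rewritten, the oldest level-0 files are simply unlinked from the version until the level fits its size budget. A toy sketch of that policy; FileMeta and the byte budget are illustrative, not RocksDB types:

#include <cstdint>
#include <deque>

struct FileMeta {
  uint64_t number;
  uint64_t size;
};

// Oldest files at the front; returns how many files were deleted.
int FifoEvict(std::deque<FileMeta>* level0, uint64_t max_total_bytes) {
  uint64_t total = 0;
  for (const auto& f : *level0) total += f.size;
  int deleted = 0;
  while (total > max_total_bytes && !level0->empty()) {
    total -= level0->front().size;
    level0->pop_front();  // the equivalent of edit->DeleteFile(level, number)
    ++deleted;
  }
  return deleted;
}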
- ThreadStatusUtil::SetColumnFamily( - c->column_family_data(), c->column_family_data()->ioptions()->env, - immutable_db_options_.enable_thread_tracking); - ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); - - compaction_job_stats.num_input_files = c->num_input_files(0); - - // Move files to next level - int32_t moved_files = 0; - int64_t moved_bytes = 0; - for (unsigned int l = 0; l < c->num_input_levels(); l++) { - if (c->level(l) == c->output_level()) { - continue; - } - for (size_t i = 0; i < c->num_input_files(l); i++) { - FileMetaData* f = c->input(l, i); - c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); - c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), - f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, - f->largest, f->smallest_seqno, f->largest_seqno, - f->marked_for_compaction); - - ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving #%" PRIu64 - " to level-%d %" PRIu64 " bytes\n", - c->column_family_data()->GetName().c_str(), - f->fd.GetNumber(), c->output_level(), - f->fd.GetFileSize()); - ++moved_files; - moved_bytes += f->fd.GetFileSize(); - } - } - - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - &mutex_, directories_.GetDbDir()); - // Use latest MutableCFOptions - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); - - VersionStorageInfo::LevelSummaryStorage tmp; - c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), - moved_bytes); - { - event_logger_.LogToBuffer(log_buffer) - << "job" << job_context->job_id << "event" - << "trivial_move" - << "destination_level" << c->output_level() << "files" << moved_files - << "total_files_size" << moved_bytes; - } - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n", - c->column_family_data()->GetName().c_str(), moved_files, - c->output_level(), moved_bytes, status.ToString().c_str(), - c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); - *made_progress = true; - - // Clear Instrument - ThreadStatusUtil::ResetThreadStatus(); - } else { - int output_level __attribute__((unused)) = c->output_level(); - TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", - &output_level); - - SequenceNumber earliest_write_conflict_snapshot; - std::vector snapshot_seqs = - snapshots_.GetAll(&earliest_write_conflict_snapshot); - - assert(is_snapshot_supported_ || snapshots_.empty()); - CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, env_options_, - versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, - &bg_error_, snapshot_seqs, earliest_write_conflict_snapshot, - table_cache_, &event_logger_, - c->mutable_cf_options()->paranoid_file_checks, - c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats); - compaction_job.Prepare(); - - mutex_.Unlock(); - compaction_job.Run(); - TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); - mutex_.Lock(); - - status = compaction_job.Install(*c->mutable_cf_options()); - if (status.ok()) { - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); - } - *made_progress = true; - } - if (c != nullptr) { - c->ReleaseCompactionFiles(status); - *made_progress = true; - NotifyOnCompactionCompleted( - c->column_family_data(), c.get(), status, - compaction_job_stats, 
job_context->job_id); - } - // this will unref its input_version and column_family_data - c.reset(); - - if (status.ok()) { - // Done - } else if (status.IsShutdownInProgress()) { - // Ignore compaction errors found during shutting down - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", - status.ToString().c_str()); - if (immutable_db_options_.paranoid_checks && bg_error_.ok()) { - bg_error_ = status; - } - } - - if (is_manual) { - ManualCompaction* m = manual_compaction; - if (!status.ok()) { - m->status = status; - m->done = true; - } - // For universal compaction: - // Because universal compaction always happens at level 0, so one - // compaction will pick up all overlapped files. No files will be - // filtered out due to size limit and left for a successive compaction. - // So we can safely conclude the current compaction. - // - // Also note that, if we don't stop here, then the current compaction - // writes a new file back to level 0, which will be used in successive - // compaction. Hence the manual compaction will never finish. - // - // Stop the compaction if manual_end points to nullptr -- this means - // that we compacted the whole range. manual_end should always point - // to nullptr in case of universal compaction - if (m->manual_end == nullptr) { - m->done = true; - } - if (!m->done) { - // We only compacted part of the requested range. Update *m - // to the range that is left to be compacted. - // Universal and FIFO compactions should always compact the whole range - assert(m->cfd->ioptions()->compaction_style != - kCompactionStyleUniversal || - m->cfd->ioptions()->num_levels > 1); - assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); - m->tmp_storage = *m->manual_end; - m->begin = &m->tmp_storage; - m->incomplete = true; - } - m->in_progress = false; // not being processed anymore - } - TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish"); - return status; -} - -bool DBImpl::HasPendingManualCompaction() { - return (!manual_compaction_dequeue_.empty()); -} - -void DBImpl::AddManualCompaction(DBImpl::ManualCompaction* m) { - manual_compaction_dequeue_.push_back(m); -} - -void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) { - // Remove from queue - std::deque::iterator it = - manual_compaction_dequeue_.begin(); - while (it != manual_compaction_dequeue_.end()) { - if (m == (*it)) { - it = manual_compaction_dequeue_.erase(it); - return; - } - it++; - } - assert(false); - return; -} - -bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) { - if (num_running_ingest_file_ > 0) { - // We need to wait for other IngestExternalFile() calls to finish - // before running a manual compaction. 
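The trivial-move branch earlier in this hunk is worth contrasting with a full compaction: when the inputs overlap nothing in the output level, the "compaction" is a pure metadata edit, one DeleteFile plus one AddFile per input, reusing the same file number, size, and key range. A toy sketch, with LevelState standing in for the VersionEdit bookkeeping:

#include <cstdint>
#include <map>
#include <vector>

struct LevelState {
  std::map<int, std::vector<uint64_t>> files_per_level;  // level -> file numbers

  // Re-parent every file from `from` to `to` without touching file contents.
  void TrivialMove(int from, int to) {
    auto& src = files_per_level[from];
    auto& dst = files_per_level[to];
    dst.insert(dst.end(), src.begin(), src.end());
    src.clear();
  }
};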
- return true;
- }
- if (m->exclusive) {
- return (bg_compaction_scheduled_ > 0);
- }
- std::deque<ManualCompaction*>::iterator it =
- manual_compaction_dequeue_.begin();
- bool seen = false;
- while (it != manual_compaction_dequeue_.end()) {
- if (m == (*it)) {
- it++;
- seen = true;
- continue;
- } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
- // The other manual compaction *it conflicts with m if it overlaps m
- // and is ahead of m in the queue but not yet in progress
- return true;
- }
- it++;
- }
- return false;
-}
-
-bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
- // Scan the manual compaction queue
- std::deque<ManualCompaction*>::iterator it =
- manual_compaction_dequeue_.begin();
- while (it != manual_compaction_dequeue_.end()) {
- if ((*it)->exclusive) {
- return true;
- }
- if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
- // A manual compaction on this column family is queued but has not
- // started yet; hold off automatic compaction until it gets to run
- return true;
- }
- it++;
- }
- return false;
-}
-
-bool DBImpl::HasExclusiveManualCompaction() {
- // Scan the manual compaction queue
- std::deque<ManualCompaction*>::iterator it =
- manual_compaction_dequeue_.begin();
- while (it != manual_compaction_dequeue_.end()) {
- if ((*it)->exclusive) {
- return true;
- }
- it++;
- }
- return false;
-}
-
-bool DBImpl::MCOverlap(ManualCompaction* m, ManualCompaction* m1) {
- if ((m->exclusive) || (m1->exclusive)) {
- return true;
- }
- if (m->cfd != m1->cfd) {
- return false;
- }
- return true;
-}
-
-size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
- mutex_.AssertHeld();
- size_t bsize = write_buffer_size / 10 + write_buffer_size;
- // Some users might set very high write_buffer_size and rely on
- // max_total_wal_size or other parameters to control the WAL size.
- if (mutable_db_options_.max_total_wal_size > 0) {
- bsize = std::min(bsize, mutable_db_options_.max_total_wal_size);
- }
- if (immutable_db_options_.db_write_buffer_size > 0) {
- bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size);
- }
- if (immutable_db_options_.write_buffer_manager &&
- immutable_db_options_.write_buffer_manager->enabled()) {
- bsize = std::min(
- bsize, immutable_db_options_.write_buffer_manager->buffer_size());
- }
-
- return bsize;
-}
-
-namespace {
-struct IterState {
- IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version,
- bool _background_purge)
- : db(_db),
- mu(_mu),
- super_version(_super_version),
- background_purge(_background_purge) {}
-
- DBImpl* db;
- InstrumentedMutex* mu;
- SuperVersion* super_version;
- bool background_purge;
-};
-
-static void CleanupIteratorState(void* arg1, void* arg2) {
- IterState* state = reinterpret_cast<IterState*>(arg1);
-
- if (state->super_version->Unref()) {
- // Job id == 0 means that this is not our background process, but rather
- // a user thread
- JobContext job_context(0);
-
- state->mu->Lock();
- state->super_version->Cleanup();
- state->db->FindObsoleteFiles(&job_context, false, true);
- if (state->background_purge) {
- state->db->ScheduleBgLogWriterClose(&job_context);
- }
- state->mu->Unlock();
-
- delete state->super_version;
- if (job_context.HaveSomethingToDelete()) {
- if (state->background_purge) {
- // PurgeObsoleteFiles here does not delete files. Instead, it adds the
- // files to be deleted to a job queue, and deletes them in a separate
- // background thread.
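GetWalPreallocateBlockSize above is simple arithmetic: start from 1.1x the write buffer and clamp by each enabled global cap. A standalone sketch of that computation; the parameters mirror the options consulted above, with zero meaning "unset":

#include <algorithm>
#include <cstddef>

size_t WalPreallocateBlockSize(size_t write_buffer_size,
                               size_t max_total_wal_size,      // 0 = unset
                               size_t db_write_buffer_size) {  // 0 = unset
  size_t bsize = write_buffer_size / 10 + write_buffer_size;  // 1.1x buffer
  if (max_total_wal_size > 0) bsize = std::min(bsize, max_total_wal_size);
  if (db_write_buffer_size > 0) bsize = std::min(bsize, db_write_buffer_size);
  return bsize;
}

For example, with a 64 MB write buffer and max_total_wal_size of 32 MB, the result is 32 MB rather than 70.4 MB.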
- state->db->PurgeObsoleteFiles(job_context, true /* schedule only */); - state->mu->Lock(); - state->db->SchedulePurge(); - state->mu->Unlock(); - } else { - state->db->PurgeObsoleteFiles(job_context); - } - } - job_context.Clean(); - } - - delete state; -} -} // namespace - -InternalIterator* DBImpl::NewInternalIterator( - const ReadOptions& read_options, ColumnFamilyData* cfd, - SuperVersion* super_version, Arena* arena, - RangeDelAggregator* range_del_agg) { - InternalIterator* internal_iter; - assert(arena != nullptr); - assert(range_del_agg != nullptr); - // Need to create internal iterator from the arena. - MergeIteratorBuilder merge_iter_builder( - &cfd->internal_comparator(), arena, - !read_options.total_order_seek && - cfd->ioptions()->prefix_extractor != nullptr); - // Collect iterator for mutable mem - merge_iter_builder.AddIterator( - super_version->mem->NewIterator(read_options, arena)); - std::unique_ptr range_del_iter; - Status s; - if (!read_options.ignore_range_deletions) { - range_del_iter.reset( - super_version->mem->NewRangeTombstoneIterator(read_options)); - s = range_del_agg->AddTombstones(std::move(range_del_iter)); - } - // Collect all needed child iterators for immutable memtables - if (s.ok()) { - super_version->imm->AddIterators(read_options, &merge_iter_builder); - if (!read_options.ignore_range_deletions) { - s = super_version->imm->AddRangeTombstoneIterators(read_options, arena, - range_del_agg); - } - } - if (s.ok()) { - // Collect iterators for files in L0 - Ln - if (read_options.read_tier != kMemtableTier) { - super_version->current->AddIterators(read_options, env_options_, - &merge_iter_builder, range_del_agg); - } - internal_iter = merge_iter_builder.Finish(); - IterState* cleanup = - new IterState(this, &mutex_, super_version, - read_options.background_purge_on_iterator_cleanup); - internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); - - return internal_iter; - } - return NewErrorInternalIterator(s); -} - -ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { - return default_cf_handle_; -} - -Status DBImpl::Get(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); -} - -// JobContext gets created and destructed outside of the lock -- -// we -// use this convinently to: -// * malloc one SuperVersion() outside of the lock -- new_superversion -// * delete SuperVersion()s outside of the lock -- superversions_to_free -// -// However, if InstallSuperVersionAndScheduleWork() gets called twice with the -// same job_context, we can't reuse the SuperVersion() that got -// malloced because -// first call already used it. In that rare case, we take a hit and create a -// new SuperVersion() inside of the mutex. 
We do a similar thing
-// for superversion_to_free
-void DBImpl::InstallSuperVersionAndScheduleWorkWrapper(
- ColumnFamilyData* cfd, JobContext* job_context,
- const MutableCFOptions& mutable_cf_options) {
- mutex_.AssertHeld();
- SuperVersion* old_superversion = InstallSuperVersionAndScheduleWork(
- cfd, job_context->new_superversion, mutable_cf_options);
- job_context->new_superversion = nullptr;
- job_context->superversions_to_free.push_back(old_superversion);
-}
-
-SuperVersion* DBImpl::InstallSuperVersionAndScheduleWork(
- ColumnFamilyData* cfd, SuperVersion* new_sv,
- const MutableCFOptions& mutable_cf_options) {
- mutex_.AssertHeld();
-
- // Update max_total_in_memory_state_
- size_t old_memtable_size = 0;
- auto* old_sv = cfd->GetSuperVersion();
- if (old_sv) {
- old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
- old_sv->mutable_cf_options.max_write_buffer_number;
- }
-
- auto* old = cfd->InstallSuperVersion(
- new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);
-
- // Whenever we install a new SuperVersion, we might need to issue new
- // flushes or compactions.
- SchedulePendingFlush(cfd);
- SchedulePendingCompaction(cfd);
- MaybeScheduleFlushOrCompaction();
-
- // Update max_total_in_memory_state_
- max_total_in_memory_state_ =
- max_total_in_memory_state_ - old_memtable_size +
- mutable_cf_options.write_buffer_size *
- mutable_cf_options.max_write_buffer_number;
- return old;
-}
-
-Status DBImpl::GetImpl(const ReadOptions& read_options,
- ColumnFamilyHandle* column_family, const Slice& key,
- PinnableSlice* pinnable_val, bool* value_found) {
- assert(pinnable_val != nullptr);
- StopWatch sw(env_, stats_, DB_GET);
- PERF_TIMER_GUARD(get_snapshot_time);
-
- auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
- auto cfd = cfh->cfd();
-
- // Acquire SuperVersion
- SuperVersion* sv = GetAndRefSuperVersion(cfd);
-
- TEST_SYNC_POINT("DBImpl::GetImpl:1");
- TEST_SYNC_POINT("DBImpl::GetImpl:2");
-
- SequenceNumber snapshot;
- if (read_options.snapshot != nullptr) {
- snapshot = reinterpret_cast<const SnapshotImpl*>(
- read_options.snapshot)->number_;
- } else {
- // Since we get and reference the super version before getting
- // the snapshot number, without a mutex protection, it is possible
- // that a memtable switch happened in the middle and not all the
- // data for this snapshot is available. But it will contain all
- // the data available in the super version we have, which is also
- // a valid snapshot to read from.
- // We shouldn't get the snapshot before finding and referencing the
- // super version because a flush happening in between may compact
- // away data for the snapshot, but the snapshot is earlier than the
- // data overwriting it, so users may see wrong results.
- snapshot = versions_->LastSequence();
- }
- TEST_SYNC_POINT("DBImpl::GetImpl:3");
- TEST_SYNC_POINT("DBImpl::GetImpl:4");
-
- // Prepare to store a list of merge operations if merge occurs.
- MergeContext merge_context;
- RangeDelAggregator range_del_agg(cfd->internal_comparator(), snapshot);
-
- Status s;
- // First look in the memtable, then in the immutable memtable (if any).
- // s is both in/out. When in, s could either be OK or MergeInProgress.
- // merge_operands will contain the sequence of merges in the latter case.
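The comment above summarizes the read path that follows: consult the newest source first and stop at the first one holding a version of the key visible at the snapshot sequence number. A simplified sketch of that cascade, ignoring merges, deletions, and range tombstones; the toy Source type is an assumption of this sketch, not a RocksDB structure:

#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Each source maps a key to its (sequence, value) versions, newest first.
using Source =
    std::map<std::string, std::vector<std::pair<uint64_t, std::string>>>;

std::optional<std::string> Visible(const Source& src, const std::string& key,
                                   uint64_t snapshot_seq) {
  auto it = src.find(key);
  if (it == src.end()) return std::nullopt;
  for (const auto& [seq, value] : it->second) {  // newest first
    if (seq <= snapshot_seq) return value;       // first visible version wins
  }
  return std::nullopt;
}

// Sources ordered memtable -> immutable memtables -> SST files.
std::optional<std::string> Get(const std::vector<const Source*>& sources,
                               const std::string& key, uint64_t snapshot_seq) {
  for (const Source* src : sources) {
    if (auto v = Visible(*src, key, snapshot_seq)) return v;  // skip older sources
  }
  return std::nullopt;
}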
- LookupKey lkey(key, snapshot); - PERF_TIMER_STOP(get_snapshot_time); - - bool skip_memtable = (read_options.read_tier == kPersistedTier && - has_unpersisted_data_.load(std::memory_order_relaxed)); - bool done = false; - if (!skip_memtable) { - if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &range_del_agg, read_options)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &range_del_agg, read_options)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } - if (!done && !s.ok() && !s.IsMergeInProgress()) { - return s; - } - } - if (!done) { - PERF_TIMER_GUARD(get_from_output_files_time); - sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, - &range_del_agg, value_found); - RecordTick(stats_, MEMTABLE_MISS); - } - - { - PERF_TIMER_GUARD(get_post_process_time); - - ReturnAndCleanupSuperVersion(cfd, sv); - - RecordTick(stats_, NUMBER_KEYS_READ); - size_t size = pinnable_val->size(); - RecordTick(stats_, BYTES_READ, size); - MeasureTime(stats_, BYTES_PER_READ, size); - } - return s; -} - -std::vector DBImpl::MultiGet( - const ReadOptions& read_options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) { - - StopWatch sw(env_, stats_, DB_MULTIGET); - PERF_TIMER_GUARD(get_snapshot_time); - - SequenceNumber snapshot; - - struct MultiGetColumnFamilyData { - ColumnFamilyData* cfd; - SuperVersion* super_version; - }; - std::unordered_map multiget_cf_data; - // fill up and allocate outside of mutex - for (auto cf : column_family) { - auto cfh = reinterpret_cast(cf); - auto cfd = cfh->cfd(); - if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { - auto mgcfd = new MultiGetColumnFamilyData(); - mgcfd->cfd = cfd; - multiget_cf_data.insert({cfd->GetID(), mgcfd}); - } - } - - mutex_.Lock(); - if (read_options.snapshot != nullptr) { - snapshot = reinterpret_cast( - read_options.snapshot)->number_; - } else { - snapshot = versions_->LastSequence(); - } - for (auto mgd_iter : multiget_cf_data) { - mgd_iter.second->super_version = - mgd_iter.second->cfd->GetSuperVersion()->Ref(); - } - mutex_.Unlock(); - - // Contain a list of merge operations if merge occurs. - MergeContext merge_context; - - // Note: this always resizes the values array - size_t num_keys = keys.size(); - std::vector stat_list(num_keys); - values->resize(num_keys); - - // Keep track of bytes that we read for statistics-recording later - uint64_t bytes_read = 0; - PERF_TIMER_STOP(get_snapshot_time); - - // For each of the given keys, apply the entire "get" process as follows: - // First look in the memtable, then in the immutable memtable (if any). - // s is both in/out. When in, s could either be OK or MergeInProgress. - // merge_operands will contain the sequence of merges in the latter case. 
- for (size_t i = 0; i < num_keys; ++i) { - merge_context.Clear(); - Status& s = stat_list[i]; - std::string* value = &(*values)[i]; - - LookupKey lkey(keys[i], snapshot); - auto cfh = reinterpret_cast(column_family[i]); - RangeDelAggregator range_del_agg(cfh->cfd()->internal_comparator(), - snapshot); - auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); - assert(mgd_iter != multiget_cf_data.end()); - auto mgd = mgd_iter->second; - auto super_version = mgd->super_version; - bool skip_memtable = - (read_options.read_tier == kPersistedTier && - has_unpersisted_data_.load(std::memory_order_relaxed)); - bool done = false; - if (!skip_memtable) { - if (super_version->mem->Get(lkey, value, &s, &merge_context, - &range_del_agg, read_options)) { - done = true; - // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? - } else if (super_version->imm->Get(lkey, value, &s, &merge_context, - &range_del_agg, read_options)) { - done = true; - // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? - } - } - if (!done) { - PinnableSlice pinnable_val; - PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(read_options, lkey, &pinnable_val, &s, - &merge_context, &range_del_agg); - value->assign(pinnable_val.data(), pinnable_val.size()); - // TODO(?): RecordTick(stats_, MEMTABLE_MISS)? - } - - if (s.ok()) { - bytes_read += value->size(); - } - } - - // Post processing (decrement reference counts and record statistics) - PERF_TIMER_GUARD(get_post_process_time); - autovector superversions_to_delete; - - // TODO(icanadi) do we need lock here or just around Cleanup()? - mutex_.Lock(); - for (auto mgd_iter : multiget_cf_data) { - auto mgd = mgd_iter.second; - if (mgd->super_version->Unref()) { - mgd->super_version->Cleanup(); - superversions_to_delete.push_back(mgd->super_version); - } - } - mutex_.Unlock(); - - for (auto td : superversions_to_delete) { - delete td; - } - for (auto mgd : multiget_cf_data) { - delete mgd.second; - } - - RecordTick(stats_, NUMBER_MULTIGET_CALLS); - RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); - RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); - MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read); - PERF_TIMER_STOP(get_post_process_time); - - return stat_list; -} - -Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, - const std::string& column_family_name, - ColumnFamilyHandle** handle) { - Status s; - Status persist_options_status; - *handle = nullptr; - - s = CheckCompressionSupported(cf_options); - if (s.ok() && immutable_db_options_.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cf_options); - } - if (!s.ok()) { - return s; - } - - { - InstrumentedMutexLock l(&mutex_); - - if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != - nullptr) { - return Status::InvalidArgument("Column family already exists"); - } - VersionEdit edit; - edit.AddColumnFamily(column_family_name); - uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); - edit.SetColumnFamily(new_id); - edit.SetLogNumber(logfile_number_); - edit.SetComparatorName(cf_options.comparator->Name()); - - // LogAndApply will both write the creation in MANIFEST and create - // ColumnFamilyData object - { // write thread - WriteThread::Writer w; - write_thread_.EnterUnbatched(&w, &mutex_); - // LogAndApply will both write the creation in MANIFEST and create - // ColumnFamilyData object - s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit, - &mutex_, directories_.GetDbDir(), false, - 
&cf_options);
-
- if (s.ok()) {
- // If the column family was created successfully, we then persist
- // the updated RocksDB options under the same single write thread
- persist_options_status = WriteOptionsFile();
- }
- write_thread_.ExitUnbatched(&w);
- }
- if (s.ok()) {
- single_column_family_mode_ = false;
- auto* cfd =
- versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
- assert(cfd != nullptr);
- delete InstallSuperVersionAndScheduleWork(
- cfd, nullptr, *cfd->GetLatestMutableCFOptions());
-
- if (!cfd->mem()->IsSnapshotSupported()) {
- is_snapshot_supported_ = false;
- }
-
- *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
- ROCKS_LOG_INFO(immutable_db_options_.info_log,
- "Created column family [%s] (ID %u)",
- column_family_name.c_str(), (unsigned)cfd->GetID());
- } else {
- ROCKS_LOG_ERROR(immutable_db_options_.info_log,
- "Creating column family [%s] FAILED -- %s",
- column_family_name.c_str(), s.ToString().c_str());
- }
- } // InstrumentedMutexLock l(&mutex_)
-
- // this is outside the mutex
- if (s.ok()) {
- NewThreadStatusCfInfo(
- reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
- if (!persist_options_status.ok()) {
- if (immutable_db_options_.fail_if_options_file_error) {
- s = Status::IOError(
- "ColumnFamily has been created, but unable to persist "
- "options in CreateColumnFamily()",
- persist_options_status.ToString().c_str());
- }
- ROCKS_LOG_WARN(immutable_db_options_.info_log,
- "Unable to persist options in CreateColumnFamily() -- %s",
- persist_options_status.ToString().c_str());
- }
- }
- return s;
-}
-
-Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
- auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
- auto cfd = cfh->cfd();
- if (cfd->GetID() == 0) {
- return Status::InvalidArgument("Can't drop default column family");
- }
-
- bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
-
- VersionEdit edit;
- edit.DropColumnFamily();
- edit.SetColumnFamily(cfd->GetID());
-
- Status s;
- Status options_persist_status;
- {
- InstrumentedMutexLock l(&mutex_);
- if (cfd->IsDropped()) {
- s = Status::InvalidArgument("Column family already dropped!\n");
- }
- if (s.ok()) {
- // we drop the column family from a single write thread
- WriteThread::Writer w;
- write_thread_.EnterUnbatched(&w, &mutex_);
- s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
- &edit, &mutex_);
- if (s.ok()) {
- // If the column family was dropped successfully, we then persist
- // the updated RocksDB options under the same single write thread
- options_persist_status = WriteOptionsFile();
- }
- write_thread_.ExitUnbatched(&w);
- }
-
- if (!cf_support_snapshot) {
- // The dropped column family doesn't support snapshots. Need to
- // recalculate is_snapshot_supported_.
- bool new_is_snapshot_supported = true;
- for (auto c : *versions_->GetColumnFamilySet()) {
- if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
- new_is_snapshot_supported = false;
- break;
- }
- }
- is_snapshot_supported_ = new_is_snapshot_supported;
- }
- }
-
- if (s.ok()) {
- // Note that here we erase the associated cf_info of the to-be-dropped
- // cfd before its ref-count goes to zero to avoid having to erase cf_info
- // later inside db_mutex.
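CreateColumnFamily above and DropColumnFamily share a pattern: apply the MANIFEST edit inside the single write thread, then persist the options file, and on a persist failure either surface an error or merely warn, depending on fail_if_options_file_error. A minimal sketch of that decision; ApplyManifestEdit, PersistOptionsFile, and the toy Status are hypothetical stand-ins:

#include <string>

struct Status {
  bool ok;
  std::string msg;
};

Status ApplyManifestEdit() { return {true, ""}; }   // stand-in for LogAndApply
Status PersistOptionsFile() { return {true, ""}; }  // stand-in for WriteOptionsFile

Status CreateWithPersistedOptions(bool fail_if_options_file_error) {
  Status s = ApplyManifestEdit();
  if (!s.ok) return s;  // the creation itself failed
  Status p = PersistOptionsFile();
  if (!p.ok && fail_if_options_file_error) {
    // The column family exists, but durable options were promised: surface it.
    return {false, "created, but unable to persist options: " + p.msg};
  }
  // Otherwise only warn; the in-memory state is already correct.
  return s;
}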
- EraseThreadStatusCfInfo(cfd);
- assert(cfd->IsDropped());
- auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
- max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
- mutable_cf_options->max_write_buffer_number;
- ROCKS_LOG_INFO(immutable_db_options_.info_log,
- "Dropped column family with id %u\n", cfd->GetID());
-
- if (!options_persist_status.ok()) {
- if (immutable_db_options_.fail_if_options_file_error) {
- s = Status::IOError(
- "ColumnFamily has been dropped, but unable to persist "
- "options in DropColumnFamily()",
- options_persist_status.ToString().c_str());
- }
- ROCKS_LOG_WARN(immutable_db_options_.info_log,
- "Unable to persist options in DropColumnFamily() -- %s",
- options_persist_status.ToString().c_str());
- }
- } else {
- ROCKS_LOG_ERROR(immutable_db_options_.info_log,
- "Dropping column family with id %u FAILED -- %s\n",
- cfd->GetID(), s.ToString().c_str());
- }
-
- return s;
-}
-
-bool DBImpl::KeyMayExist(const ReadOptions& read_options,
- ColumnFamilyHandle* column_family, const Slice& key,
- std::string* value, bool* value_found) {
- assert(value != nullptr);
- if (value_found != nullptr) {
- // falsify later if key-may-exist but can't fetch value
- *value_found = true;
- }
- ReadOptions roptions = read_options;
- roptions.read_tier = kBlockCacheTier; // read from block cache only
- PinnableSlice pinnable_val;
- auto s = GetImpl(roptions, column_family, key, &pinnable_val, value_found);
- value->assign(pinnable_val.data(), pinnable_val.size());
-
- // If block_cache is enabled and the index block of the table is not
- // present in block_cache, the return value will be Status::Incomplete.
- // In this case, the key may still exist in the table.
- return s.ok() || s.IsIncomplete();
-}
-
-Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
- ColumnFamilyHandle* column_family) {
- if (read_options.read_tier == kPersistedTier) {
- return NewErrorIterator(Status::NotSupported(
- "ReadTier::kPersistedData is not yet supported in iterators."));
- }
- auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
- auto cfd = cfh->cfd();
- if (read_options.managed) {
-#ifdef ROCKSDB_LITE
- // not supported in lite version
- return NewErrorIterator(Status::InvalidArgument(
- "Managed Iterators not supported in RocksDBLite."));
-#else
- if ((read_options.tailing) || (read_options.snapshot != nullptr) ||
- (is_snapshot_supported_)) {
- return new ManagedIterator(this, read_options, cfd);
- }
- // Managed iter not supported
- return NewErrorIterator(Status::InvalidArgument(
- "Managed Iterators not supported without snapshots."));
-#endif
- } else if (read_options.tailing) {
-#ifdef ROCKSDB_LITE
- // not supported in lite version
- return nullptr;
-#else
- SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
- auto iter = new ForwardIterator(this, read_options, cfd, sv);
- return NewDBIterator(
- env_, *cfd->ioptions(), cfd->user_comparator(), iter,
- kMaxSequenceNumber,
- sv->mutable_cf_options.max_sequential_skip_in_iterations,
- sv->version_number, read_options.iterate_upper_bound,
- read_options.prefix_same_as_start, read_options.pin_data,
- read_options.total_order_seek,
- read_options.max_skippable_internal_keys);
-#endif
- } else {
- SequenceNumber latest_snapshot = versions_->LastSequence();
- SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
-
- auto snapshot =
- read_options.snapshot != nullptr
- ?
reinterpret_cast( - read_options.snapshot)->number_ - : latest_snapshot; - - // Try to generate a DB iterator tree in continuous memory area to be - // cache friendly. Here is an example of result: - // +-------------------------------+ - // | | - // | ArenaWrappedDBIter | - // | + | - // | +---> Inner Iterator ------------+ - // | | | | - // | | +-- -- -- -- -- -- -- --+ | - // | +--- | Arena | | - // | | | | - // | Allocated Memory: | | - // | | +-------------------+ | - // | | | DBIter | <---+ - // | | + | - // | | | +-> iter_ ------------+ - // | | | | | - // | | +-------------------+ | - // | | | MergingIterator | <---+ - // | | + | - // | | | +->child iter1 ------------+ - // | | | | | | - // | | +->child iter2 ----------+ | - // | | | | | | | - // | | | +->child iter3 --------+ | | - // | | | | | | - // | | +-------------------+ | | | - // | | | Iterator1 | <--------+ - // | | +-------------------+ | | - // | | | Iterator2 | <------+ - // | | +-------------------+ | - // | | | Iterator3 | <----+ - // | | +-------------------+ - // | | | - // +-------+-----------------------+ - // - // ArenaWrappedDBIter inlines an arena area where all the iterators in - // the iterator tree are allocated in the order of being accessed when - // querying. - // Laying out the iterators in the order of being accessed makes it more - // likely that any iterator pointer is close to the iterator it points to so - // that they are likely to be in the same cache line and/or page. - ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_options.iterate_upper_bound, - read_options.prefix_same_as_start, read_options.pin_data, - read_options.total_order_seek, - read_options.max_skippable_internal_keys); - - InternalIterator* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator()); - db_iter->SetIterUnderDBIter(internal_iter); - - return db_iter; - } - // To stop compiler from complaining - return nullptr; -} - -Status DBImpl::NewIterators( - const ReadOptions& read_options, - const std::vector& column_families, - std::vector* iterators) { - if (read_options.read_tier == kPersistedTier) { - return Status::NotSupported( - "ReadTier::kPersistedData is not yet supported in iterators."); - } - iterators->clear(); - iterators->reserve(column_families.size()); - if (read_options.managed) { -#ifdef ROCKSDB_LITE - return Status::InvalidArgument( - "Managed interator not supported in RocksDB lite"); -#else - if ((!read_options.tailing) && (read_options.snapshot == nullptr) && - (!is_snapshot_supported_)) { - return Status::InvalidArgument( - "Managed interator not supported without snapshots"); - } - for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ManagedIterator(this, read_options, cfd); - iterators->push_back(iter); - } -#endif - } else if (read_options.tailing) { -#ifdef ROCKSDB_LITE - return Status::InvalidArgument( - "Tailing interator not supported in RocksDB lite"); -#else - for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); - auto iter = new ForwardIterator(this, read_options, cfd, sv); - iterators->push_back(NewDBIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - 
sv->version_number, nullptr, false, read_options.pin_data, - read_options.total_order_seek, - read_options.max_skippable_internal_keys)); - } -#endif - } else { - SequenceNumber latest_snapshot = versions_->LastSequence(); - - for (size_t i = 0; i < column_families.size(); ++i) { - auto* cfd = reinterpret_cast( - column_families[i])->cfd(); - SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); - - auto snapshot = - read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ - : latest_snapshot; - - ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, nullptr, false, read_options.pin_data, - read_options.total_order_seek, - read_options.max_skippable_internal_keys); - InternalIterator* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator()); - db_iter->SetIterUnderDBIter(internal_iter); - iterators->push_back(db_iter); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + if (read_options.managed) { +#ifdef ROCKSDB_LITE + // not supported in lite version + return NewErrorIterator(Status::InvalidArgument( + "Managed Iterators not supported in RocksDBLite.")); +#else + if ((read_options.tailing) || (read_options.snapshot != nullptr) || + (is_snapshot_supported_)) { + return new ManagedIterator(this, read_options, cfd); } - } - - return Status::OK(); -} - -const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } - -#ifndef ROCKSDB_LITE -const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { - return GetSnapshotImpl(true); -} -#endif // ROCKSDB_LITE - -const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) { - int64_t unix_time = 0; - env_->GetCurrentTime(&unix_time); // Ignore error - SnapshotImpl* s = new SnapshotImpl; - - InstrumentedMutexLock l(&mutex_); - // returns null if the underlying memtable does not support snapshot. 
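GetSnapshotImpl below boils down to pinning the current last sequence number in an ordered list, so the oldest live snapshot, the horizon that compaction must respect, stays O(1) to find. A minimal sketch under that assumption; SnapshotList here is illustrative and not the RocksDB class of the same name:

#include <cstdint>
#include <ctime>
#include <list>

struct Snapshot {
  uint64_t seq;
  int64_t unix_time;
};

class SnapshotList {
 public:
  // Pin the given sequence number; insertion order == ascending seq.
  std::list<Snapshot>::iterator New(uint64_t last_sequence) {
    return list_.insert(list_.end(),
                        Snapshot{last_sequence, std::time(nullptr)});
  }
  void Release(std::list<Snapshot>::iterator it) { list_.erase(it); }
  bool empty() const { return list_.empty(); }
  uint64_t oldest_seq() const { return list_.front().seq; }  // requires !empty()

 private:
  std::list<Snapshot> list_;
};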
- if (!is_snapshot_supported_) { - delete s; + // Managed iter not supported + return NewErrorIterator(Status::InvalidArgument( + "Managed Iterators not supported without snapshots.")); +#endif + } else if (read_options.tailing) { +#ifdef ROCKSDB_LITE + // not supported in lite version return nullptr; - } - return snapshots_.New(s, versions_->LastSequence(), unix_time, - is_write_conflict_boundary); -} - -void DBImpl::ReleaseSnapshot(const Snapshot* s) { - const SnapshotImpl* casted_s = reinterpret_cast(s); - { - InstrumentedMutexLock l(&mutex_); - snapshots_.Delete(casted_s); - } - delete casted_s; -} - -// Convenience methods -Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, - const Slice& key, const Slice& val) { - return DB::Put(o, column_family, key, val); -} - -Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, - const Slice& key, const Slice& val) { - auto cfh = reinterpret_cast(column_family); - if (!cfh->cfd()->ioptions()->merge_operator) { - return Status::NotSupported("Provide a merge_operator when opening DB"); +#else + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); + return NewDBIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_options.iterate_upper_bound, + read_options.prefix_same_as_start, read_options.pin_data, + read_options.total_order_seek, + read_options.max_skippable_internal_keys); +#endif } else { - return DB::Merge(o, column_family, key, val); - } -} - -Status DBImpl::Delete(const WriteOptions& write_options, - ColumnFamilyHandle* column_family, const Slice& key) { - return DB::Delete(write_options, column_family, key); -} - -Status DBImpl::SingleDelete(const WriteOptions& write_options, - ColumnFamilyHandle* column_family, - const Slice& key) { - return DB::SingleDelete(write_options, column_family, key); -} - -Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { - return WriteImpl(write_options, my_batch, nullptr, nullptr); -} - -#ifndef ROCKSDB_LITE -Status DBImpl::WriteWithCallback(const WriteOptions& write_options, - WriteBatch* my_batch, - WriteCallback* callback) { - return WriteImpl(write_options, my_batch, callback, nullptr); -} -#endif // ROCKSDB_LITE - -Status DBImpl::WriteImpl(const WriteOptions& write_options, - WriteBatch* my_batch, WriteCallback* callback, - uint64_t* log_used, uint64_t log_ref, - bool disable_memtable) { - if (my_batch == nullptr) { - return Status::Corruption("Batch is nullptr!"); - } - - Status status; - - PERF_TIMER_GUARD(write_pre_and_post_process_time); - WriteThread::Writer w(write_options, my_batch, callback, log_ref, - disable_memtable); - - if (!write_options.disableWAL) { - RecordTick(stats_, WRITE_WITH_WAL); - } - - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); - - write_thread_.JoinBatchGroup(&w); - if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) { - // we are a non-leader in a parallel group - PERF_TIMER_GUARD(write_memtable_time); - - if (w.ShouldWriteToMemtable()) { - ColumnFamilyMemTablesImpl column_family_memtables( - versions_->GetColumnFamilySet()); - WriteBatchInternal::SetSequence(w.batch, w.sequence); - w.status = WriteBatchInternal::InsertInto( - &w, &column_family_memtables, &flush_scheduler_, - write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true 
/*concurrent_memtable_writes*/); - } - - if (write_thread_.CompleteParallelWorker(&w)) { - // we're responsible for early exit - auto last_sequence = w.parallel_group->last_sequence; - versions_->SetLastSequence(last_sequence); - write_thread_.EarlyExitParallelGroup(&w); - } - assert(w.state == WriteThread::STATE_COMPLETED); - // STATE_COMPLETED conditional below handles exit - - status = w.FinalStatus(); - } - if (w.state == WriteThread::STATE_COMPLETED) { - if (log_used != nullptr) { - *log_used = w.log_used; - } - // write is complete and leader has updated sequence - return w.FinalStatus(); - } - // else we are the leader of the write batch group - assert(w.state == WriteThread::STATE_GROUP_LEADER); - - // Once reaches this point, the current writer "w" will try to do its write - // job. It may also pick up some of the remaining writers in the "writers_" - // when it finds suitable, and finish them in the same write batch. - // This is how a write job could be done by the other writer. - WriteContext write_context; - WriteThread::Writer* last_writer = &w; - autovector write_group; - bool logs_getting_synced = false; - - mutex_.Lock(); - - bool need_log_sync = !write_options.disableWAL && write_options.sync; - bool need_log_dir_sync = need_log_sync && !log_dir_synced_; - status = PreprocessWrite(write_options, need_log_sync, &logs_getting_synced, - &write_context); - uint64_t last_sequence = versions_->LastSequence(); - log::Writer* cur_log_writer = logs_.back().writer; - - mutex_.Unlock(); - - // Add to log and apply to memtable. We can release the lock - // during this phase since &w is currently responsible for logging - // and protects against concurrent loggers and concurrent writes - // into memtables + SequenceNumber latest_snapshot = versions_->LastSequence(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); - bool exit_completed_early = false; - last_batch_group_size_ = - write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group); + auto snapshot = + read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot; - if (status.ok()) { - // Rules for when we can update the memtable concurrently - // 1. supported by memtable - // 2. Puts are not okay if inplace_update_support - // 3. Deletes or SingleDeletes are not okay if filtering deletes - // (controlled by both batch and memtable setting) - // 4. Merges are not okay + // Try to generate a DB iterator tree in continuous memory area to be + // cache friendly. 
Here is an example of result: + // +-------------------------------+ + // | | + // | ArenaWrappedDBIter | + // | + | + // | +---> Inner Iterator ------------+ + // | | | | + // | | +-- -- -- -- -- -- -- --+ | + // | +--- | Arena | | + // | | | | + // | Allocated Memory: | | + // | | +-------------------+ | + // | | | DBIter | <---+ + // | | + | + // | | | +-> iter_ ------------+ + // | | | | | + // | | +-------------------+ | + // | | | MergingIterator | <---+ + // | | + | + // | | | +->child iter1 ------------+ + // | | | | | | + // | | +->child iter2 ----------+ | + // | | | | | | | + // | | | +->child iter3 --------+ | | + // | | | | | | + // | | +-------------------+ | | | + // | | | Iterator1 | <--------+ + // | | +-------------------+ | | + // | | | Iterator2 | <------+ + // | | +-------------------+ | + // | | | Iterator3 | <----+ + // | | +-------------------+ + // | | | + // +-------+-----------------------+ // - // Rules 1..3 are enforced by checking the options - // during startup (CheckConcurrentWritesSupported), so if - // options.allow_concurrent_memtable_write is true then they can be - // assumed to be true. Rule 4 is checked for each batch. We could - // relax rules 2 and 3 if we could prevent write batches from referring - // more than once to a particular key. - bool parallel = immutable_db_options_.allow_concurrent_memtable_write && - write_group.size() > 1; - int total_count = 0; - uint64_t total_byte_size = 0; - for (auto writer : write_group) { - if (writer->CheckCallback(this)) { - if (writer->ShouldWriteToMemtable()) { - total_count += WriteBatchInternal::Count(writer->batch); - parallel = parallel && !writer->batch->HasMerge(); - } - - if (writer->ShouldWriteToWAL()) { - total_byte_size = WriteBatchInternal::AppendedByteSize( - total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); - } - } - } - - const SequenceNumber current_sequence = last_sequence + 1; - last_sequence += total_count; - - // Update stats while we are an exclusive group leader, so we know - // that nobody else can be writing to these particular stats. - // We're optimistic, updating the stats before we successfully - // commit. That lets us release our leader status early in - // some cases. 
- auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count); - RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); - stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size); - RecordTick(stats_, BYTES_WRITTEN, total_byte_size); - stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1); - RecordTick(stats_, WRITE_DONE_BY_SELF); - auto write_done_by_other = write_group.size() - 1; - if (write_done_by_other > 0) { - stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, - write_done_by_other); - RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); - } - MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size); - - if (write_options.disableWAL) { - has_unpersisted_data_.store(true, std::memory_order_relaxed); - } - - PERF_TIMER_STOP(write_pre_and_post_process_time); - - if (status.ok() && !write_options.disableWAL) { - PERF_TIMER_GUARD(write_wal_time); - status = WriteToWAL(write_group, cur_log_writer, need_log_sync, - need_log_dir_sync, current_sequence); - if (log_used != nullptr) { - *log_used = logfile_number_; - } - } - - if (status.ok()) { - PERF_TIMER_GUARD(write_memtable_time); - - if (!parallel) { - status = WriteBatchInternal::InsertInto( - write_group, current_sequence, column_family_memtables_.get(), - &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*log_number*/, this); - - if (status.ok()) { - // There were no write failures. Set leader's status - // in case the write callback returned a non-ok status. - status = w.FinalStatus(); - } - - } else { - WriteThread::ParallelGroup pg; - pg.leader = &w; - pg.last_writer = last_writer; - pg.last_sequence = last_sequence; - pg.early_exit_allowed = !need_log_sync; - pg.running.store(static_cast(write_group.size()), - std::memory_order_relaxed); - write_thread_.LaunchParallelFollowers(&pg, current_sequence); - - if (w.ShouldWriteToMemtable()) { - // do leader write - ColumnFamilyMemTablesImpl column_family_memtables( - versions_->GetColumnFamilySet()); - assert(w.sequence == current_sequence); - WriteBatchInternal::SetSequence(w.batch, w.sequence); - w.status = WriteBatchInternal::InsertInto( - &w, &column_family_memtables, &flush_scheduler_, - write_options.ignore_missing_column_families, 0 /*log_number*/, - this, true /*concurrent_memtable_writes*/); - } - - // CompleteParallelWorker returns true if this thread should - // handle exit, false means somebody else did - exit_completed_early = !write_thread_.CompleteParallelWorker(&w); - status = w.FinalStatus(); - } - - if (!exit_completed_early && w.status.ok()) { - versions_->SetLastSequence(last_sequence); - if (!need_log_sync) { - write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); - exit_completed_early = true; - } - } - - // A non-OK status here indicates that the state implied by the - // WAL has diverged from the in-memory state. This could be - // because of a corrupt write_batch (very bad), or because the - // client specified an invalid column family and didn't specify - // ignore_missing_column_families. - // - // Is setting bg_error_ enough here? This will at least stop - // compaction and fail any further writes. 
- if (!status.ok() && bg_error_.ok() && !w.CallbackFailed()) { - bg_error_ = status; - } - } - } - PERF_TIMER_START(write_pre_and_post_process_time); - - if (immutable_db_options_.paranoid_checks && !status.ok() && - !w.CallbackFailed() && !status.IsBusy() && !status.IsIncomplete()) { - mutex_.Lock(); - if (bg_error_.ok()) { - bg_error_ = status; // stop compaction & fail any further writes - } - mutex_.Unlock(); - } - - if (logs_getting_synced) { - mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, status); - mutex_.Unlock(); - } - - if (!exit_completed_early) { - write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); - } - - return status; -} - -Status DBImpl::PreprocessWrite(const WriteOptions& write_options, - bool need_log_sync, bool* logs_getting_synced, - WriteContext* write_context) { - mutex_.AssertHeld(); - assert(write_context != nullptr && logs_getting_synced != nullptr); - Status status; - - assert(!single_column_family_mode_ || - versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); - if (UNLIKELY(status.ok() && !single_column_family_mode_ && - total_log_size_ > GetMaxTotalWalSize())) { - status = HandleWALFull(write_context); - } - - if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) { - // Before a new memtable is added in SwitchMemtable(), - // write_buffer_manager_->ShouldFlush() will keep returning true. If another - // thread is writing to another DB with the same write buffer, they may also - // be flushed. We may end up with flushing much more DBs than needed. It's - // suboptimal but still correct. - status = HandleWriteBufferFull(write_context); - } - - if (UNLIKELY(status.ok() && !bg_error_.ok())) { - status = bg_error_; - } - - if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { - status = ScheduleFlushes(write_context); - } - - if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || - write_controller_.NeedsDelay()))) { - PERF_TIMER_GUARD(write_delay_time); - // We don't know size of curent batch so that we always use the size - // for previous one. It might create a fairness issue that expiration - // might happen for smaller writes but larger writes can go through. - // Can optimize it if it is an issue. - status = DelayWrite(last_batch_group_size_, write_options); - } - - if (status.ok() && need_log_sync) { - while (logs_.front().getting_synced) { - log_sync_cv_.Wait(); - } - for (auto& log : logs_) { - assert(!log.getting_synced); - log.getting_synced = true; - } - *logs_getting_synced = true; - } - - return status; -} - -Status DBImpl::WriteToWAL(const autovector& write_group, - log::Writer* log_writer, bool need_log_sync, - bool need_log_dir_sync, SequenceNumber sequence) { - Status status; - - WriteBatch* merged_batch = nullptr; - size_t write_with_wal = 0; - if (write_group.size() == 1 && write_group[0]->ShouldWriteToWAL() && - write_group[0]->batch->GetWalTerminationPoint().is_cleared()) { - // we simply write the first WriteBatch to WAL if the group only - // contains one batch, that batch should be written to the WAL, - // and the batch is not wanting to be truncated - merged_batch = write_group[0]->batch; - write_group[0]->log_used = logfile_number_; - write_with_wal = 1; - } else { - // WAL needs all of the batches flattened into a single batch. 
- // We could avoid copying here with an iov-like AddRecord - // interface - merged_batch = &tmp_batch_; - for (auto writer : write_group) { - if (writer->ShouldWriteToWAL()) { - WriteBatchInternal::Append(merged_batch, writer->batch, - /*WAL_only*/ true); - write_with_wal++; - } - writer->log_used = logfile_number_; - } - } - - WriteBatchInternal::SetSequence(merged_batch, sequence); - - Slice log_entry = WriteBatchInternal::Contents(merged_batch); - status = log_writer->AddRecord(log_entry); - total_log_size_ += log_entry.size(); - alive_log_files_.back().AddSize(log_entry.size()); - log_empty_ = false; - uint64_t log_size = log_entry.size(); - - if (status.ok() && need_log_sync) { - StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); - // It's safe to access logs_ with unlocked mutex_ here because: - // - we've set getting_synced=true for all logs, - // so other threads won't pop from logs_ while we're here, - // - only writer thread can push to logs_, and we're in - // writer thread, so no one will push to logs_, - // - as long as other threads don't modify it, it's safe to read - // from std::deque from multiple threads concurrently. - for (auto& log : logs_) { - status = log.writer->file()->Sync(immutable_db_options_.use_fsync); - if (!status.ok()) { - break; - } - } - if (status.ok() && need_log_dir_sync) { - // We only sync WAL directory the first time WAL syncing is - // requested, so that in case users never turn on WAL sync, - // we can avoid the disk I/O in the write code path. - status = directories_.GetWalDir()->Fsync(); - } - } - - if (merged_batch == &tmp_batch_) { - tmp_batch_.Clear(); - } - if (status.ok()) { - auto stats = default_cf_internal_stats_; - if (need_log_sync) { - stats->AddDBStats(InternalStats::WAL_FILE_SYNCED, 1); - RecordTick(stats_, WAL_FILE_SYNCED); - } - stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size); - RecordTick(stats_, WAL_FILE_BYTES, log_size); - stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal); - RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); - } - return status; -} - -Status DBImpl::HandleWALFull(WriteContext* write_context) { - mutex_.AssertHeld(); - assert(write_context != nullptr); - Status status; - - if (alive_log_files_.begin()->getting_flushed) { - return status; - } - - auto oldest_alive_log = alive_log_files_.begin()->number; - auto oldest_log_with_uncommited_prep = FindMinLogContainingOutstandingPrep(); - - if (allow_2pc() && - oldest_log_with_uncommited_prep > 0 && - oldest_log_with_uncommited_prep <= oldest_alive_log) { - if (unable_to_flush_oldest_log_) { - // we already attempted to flush all column families dependent on - // the oldest alive log but the log still contained uncommited transactions. - // the oldest alive log STILL contains uncommited transaction so there - // is still nothing that we can do. - return status; - } else { - ROCKS_LOG_WARN( - immutable_db_options_.info_log, - "Unable to release oldest log due to uncommited transaction"); - unable_to_flush_oldest_log_ = true; - } - } else { - // we only mark this log as getting flushed if we have successfully - // flushed all data in this log. If this log contains outstanding prepared - // transactions then we cannot flush this log until those transactions are commited. - unable_to_flush_oldest_log_ = false; - alive_log_files_.begin()->getting_flushed = true; - } + // ArenaWrappedDBIter inlines an arena area where all the iterators in + // the iterator tree are allocated in the order of being accessed when + // querying. 
+ // Laying out the iterators in the order of being accessed makes it more + likely that any iterator pointer is close to the iterator it points to so + that they are likely to be in the same cache line and/or page. + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_options.iterate_upper_bound, + read_options.prefix_same_as_start, read_options.pin_data, + read_options.total_order_seek, + read_options.max_skippable_internal_keys); - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Flushing all column families with data in WAL number %" PRIu64 - ". Total log size is %" PRIu64 - " while max_total_wal_size is %" PRIu64, - oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); - // no need to refcount because drop is happening in write thread, so can't - // happen while we're in the write thread - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - if (cfd->OldestLogToKeep() <= oldest_alive_log) { - status = SwitchMemtable(cfd, write_context); - if (!status.ok()) { - break; - } - cfd->imm()->FlushRequested(); - SchedulePendingFlush(cfd); - } + InternalIterator* internal_iter = + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator()); + db_iter->SetIterUnderDBIter(internal_iter); + + return db_iter; } - MaybeScheduleFlushOrCompaction(); - return status; + // To stop compiler from complaining + return nullptr; } -Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { - mutex_.AssertHeld(); - assert(write_context != nullptr); - Status status; - - // Before a new memtable is added in SwitchMemtable(), - // write_buffer_manager_->ShouldFlush() will keep returning true. If another - // thread is writing to another DB with the same write buffer, they may also - // be flushed. We may end up with flushing much more DBs than needed. It's - // suboptimal but still correct. - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "Flushing column family with largest mem table size. Write buffer is " - "using %" PRIu64 " bytes out of a total of %" PRIu64 ".", - write_buffer_manager_->memory_usage(), - write_buffer_manager_->buffer_size()); - // no need to refcount because drop is happening in write thread, so can't - // happen while we're in the write thread - ColumnFamilyData* cfd_picked = nullptr; - SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; - - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; +Status DBImpl::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.managed) { +#ifdef ROCKSDB_LITE + return Status::InvalidArgument( + "Managed iterator not supported in RocksDB lite"); +#else + if ((!read_options.tailing) && (read_options.snapshot == nullptr) && + (!is_snapshot_supported_)) { + return Status::InvalidArgument( + "Managed iterator not supported without snapshots"); } - if (!cfd->mem()->IsEmpty()) { - // We only consider active mem table, hoping immutable memtable is - // already in the process of flushing.
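[The cache-locality argument in the comment above is the whole point of the arena: one contiguous allocation, with the iterator tree placed into it back to back in access order. A self-contained sketch of the placement-new mechanics, using a hypothetical FlatArena rather than RocksDB's Arena class:]

#include <cassert>
#include <cstddef>
#include <new>
#include <utility>

class FlatArena {
 public:
  FlatArena(char* buf, size_t cap) : buf_(buf), cap_(cap) {}

  // Bump-allocate aligned storage and placement-new T into it; successive
  // calls lay objects out contiguously, so a parent iterator and the child
  // it points at tend to share a cache line or page.
  template <typename T, typename... Args>
  T* New(Args&&... args) {
    size_t off = (used_ + alignof(T) - 1) & ~(alignof(T) - 1);
    assert(off + sizeof(T) <= cap_);
    used_ = off + sizeof(T);
    return new (buf_ + off) T(std::forward<Args>(args)...);
  }

 private:
  char* buf_;
  size_t cap_;
  size_t used_ = 0;
};

[Building the tree from the diagram would then mean allocating in query-access order: db iterator first, merging iterator next, child iterators after that, so each pointer's target is adjacent in memory. Objects created this way need manual destructor calls; the arena frees nothing per-object.]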
- uint64_t seq = cfd->mem()->GetCreationSeq(); - if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { - cfd_picked = cfd; - seq_num_for_cf_picked = seq; - } + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + auto iter = new ManagedIterator(this, read_options, cfd); + iterators->push_back(iter); } - } - if (cfd_picked != nullptr) { - status = SwitchMemtable(cfd_picked, write_context); - if (status.ok()) { - cfd_picked->imm()->FlushRequested(); - SchedulePendingFlush(cfd_picked); - MaybeScheduleFlushOrCompaction(); +#endif + } else if (read_options.tailing) { +#ifdef ROCKSDB_LITE + return Status::InvalidArgument( + "Tailing iterator not supported in RocksDB lite"); +#else + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); + iterators->push_back(NewDBIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, nullptr, false, read_options.pin_data, + read_options.total_order_seek, + read_options.max_skippable_internal_keys)); } - } - return status; -} - -uint64_t DBImpl::GetMaxTotalWalSize() const { - mutex_.AssertHeld(); - return mutable_db_options_.max_total_wal_size == 0 - ? 4 * max_total_in_memory_state_ - : mutable_db_options_.max_total_wal_size; -} - -// REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::DelayWrite(uint64_t num_bytes, - const WriteOptions& write_options) { - uint64_t time_delayed = 0; - bool delayed = false; - { - StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); - uint64_t delay = write_controller_.GetDelay(env_, num_bytes); - if (delay > 0) { - if (write_options.no_slowdown) { - return Status::Incomplete(); - } - TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); +#endif + } else { + SequenceNumber latest_snapshot = versions_->LastSequence(); - mutex_.Unlock(); - // We will delay the write until we have slept for delay ms or - // we don't need a delay anymore - const uint64_t kDelayInterval = 1000; - uint64_t stall_end = sw.start_time() + delay; - while (write_controller_.NeedsDelay()) { - if (env_->NowMicros() >= stall_end) { - // We already delayed this write `delay` microseconds - break; - } + for (size_t i = 0; i < column_families.size(); ++i) { + auto* cfd = reinterpret_cast( + column_families[i])->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); - delayed = true; - // Sleep for 0.001 seconds - env_->SleepForMicroseconds(kDelayInterval); - } - mutex_.Lock(); - } + auto snapshot = + read_options.snapshot != nullptr + ?
reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot; - while (bg_error_.ok() && write_controller_.IsStopped()) { - if (write_options.no_slowdown) { - return Status::Incomplete(); - } - delayed = true; - TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); - bg_cv_.Wait(); + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, nullptr, false, read_options.pin_data, + read_options.total_order_seek, + read_options.max_skippable_internal_keys); + InternalIterator* internal_iter = + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator()); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); } } - assert(!delayed || !write_options.no_slowdown); - if (delayed) { - default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS, - time_delayed); - RecordTick(stats_, STALL_MICROS, time_delayed); - } - - return bg_error_; -} -Status DBImpl::ScheduleFlushes(WriteContext* context) { - ColumnFamilyData* cfd; - while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { - auto status = SwitchMemtable(cfd, context); - if (cfd->Unref()) { - delete cfd; - } - if (!status.ok()) { - return status; - } - } return Status::OK(); } -#ifndef ROCKSDB_LITE -void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd, - const MemTableInfo& mem_table_info) { - if (immutable_db_options_.listeners.size() == 0U) { - return; - } - if (shutting_down_.load(std::memory_order_acquire)) { - return; - } +const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } - for (auto listener : immutable_db_options_.listeners) { - listener->OnMemTableSealed(mem_table_info); - } +#ifndef ROCKSDB_LITE +const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { + return GetSnapshotImpl(true); } #endif // ROCKSDB_LITE -// REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { - mutex_.AssertHeld(); - unique_ptr lfile; - log::Writer* new_log = nullptr; - MemTable* new_mem = nullptr; - - // Attempt to switch to a new memtable and trigger flush of old. - // Do this without holding the dbmutex lock. - assert(versions_->prev_log_number() == 0); - bool creating_new_log = !log_empty_; - uint64_t recycle_log_number = 0; - if (creating_new_log && immutable_db_options_.recycle_log_file_num && - !log_recycle_files.empty()) { - recycle_log_number = log_recycle_files.front(); - log_recycle_files.pop_front(); +const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) { + int64_t unix_time = 0; + env_->GetCurrentTime(&unix_time); // Ignore error + SnapshotImpl* s = new SnapshotImpl; + + InstrumentedMutexLock l(&mutex_); + // returns null if the underlying memtable does not support snapshot. + if (!is_snapshot_supported_) { + delete s; + return nullptr; } - uint64_t new_log_number = - creating_new_log ? 
versions_->NewFileNumber() : logfile_number_; - SuperVersion* new_superversion = nullptr; - const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + return snapshots_.New(s, versions_->LastSequence(), unix_time, + is_write_conflict_boundary); +} - // Set current_memtble_info for memtable sealed callback -#ifndef ROCKSDB_LITE - MemTableInfo memtable_info; - memtable_info.cf_name = cfd->GetName(); - memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); - memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); - memtable_info.num_entries = cfd->mem()->num_entries(); - memtable_info.num_deletes = cfd->mem()->num_deletes(); -#endif // ROCKSDB_LITE - // Log this later after lock release. It may be outdated, e.g., if background - // flush happens before logging, but that should be ok. - int num_imm_unflushed = cfd->imm()->NumNotFlushed(); - DBOptions db_options = - BuildDBOptions(immutable_db_options_, mutable_db_options_); - const auto preallocate_block_size = - GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); - mutex_.Unlock(); - Status s; +void DBImpl::ReleaseSnapshot(const Snapshot* s) { + const SnapshotImpl* casted_s = reinterpret_cast(s); { - if (creating_new_log) { - EnvOptions opt_env_opt = - env_->OptimizeForLogWrite(env_options_, db_options); - if (recycle_log_number) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "reusing log %" PRIu64 " from recycle list\n", - recycle_log_number); - s = env_->ReuseWritableFile( - LogFileName(immutable_db_options_.wal_dir, new_log_number), - LogFileName(immutable_db_options_.wal_dir, recycle_log_number), - &lfile, opt_env_opt); - } else { - s = NewWritableFile( - env_, LogFileName(immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_opt); - } - if (s.ok()) { - // Our final size should be less than write_buffer_size - // (compression, etc) but err on the side of caution. - - // use preallocate_block_size instead - // of calling GetWalPreallocateBlockSize() - lfile->SetPreallocationBlockSize(preallocate_block_size); - unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), opt_env_opt)); - new_log = - new log::Writer(std::move(file_writer), new_log_number, - immutable_db_options_.recycle_log_file_num > 0); - } - } - - if (s.ok()) { - SequenceNumber seq = versions_->LastSequence(); - new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); - new_superversion = new SuperVersion(); - } - -#ifndef ROCKSDB_LITE - // PLEASE NOTE: We assume that there are no failable operations - // after lock is acquired below since we are already notifying - // client about mem table becoming immutable. - NotifyOnMemTableSealed(cfd, memtable_info); -#endif //ROCKSDB_LITE - } - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[%s] New memtable created with log file: #%" PRIu64 - ". Immutable memtables: %d.\n", - cfd->GetName().c_str(), new_log_number, num_imm_unflushed); - mutex_.Lock(); - if (!s.ok()) { - // how do we fail if we're not creating new log? 
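[Stepping back to the snapshot entry points kept above (GetSnapshotImpl() and ReleaseSnapshot()): from the caller's side the contract is that GetSnapshot() may return nullptr when the memtable type does not support snapshots, and every snapshot obtained must be released. A small usage sketch against the public API, error handling elided:]

#include <string>
#include "rocksdb/db.h"

void ReadAtFixedPoint(rocksdb::DB* db) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();
  if (snap == nullptr) {
    return;  // memtable type without snapshot support
  }
  rocksdb::ReadOptions ro;
  ro.snapshot = snap;  // reads now see a frozen sequence number
  std::string value;
  db->Get(ro, "key", &value);  // Status check elided for brevity
  db->ReleaseSnapshot(snap);   // required: the SnapshotImpl is heap-allocated
}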
- assert(creating_new_log); - assert(!new_mem); - assert(!new_log); - return s; - } - if (creating_new_log) { - logfile_number_ = new_log_number; - assert(new_log != nullptr); - log_empty_ = true; - log_dir_synced_ = false; - logs_.emplace_back(logfile_number_, new_log); - alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); - } - for (auto loop_cfd : *versions_->GetColumnFamilySet()) { - // all this is just optimization to delete logs that - // are no longer needed -- if CF is empty, that means it - // doesn't need that particular log to stay alive, so we just - // advance the log number. no need to persist this in the manifest - if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && - loop_cfd->imm()->NumNotFlushed() == 0) { - if (creating_new_log) { - loop_cfd->SetLogNumber(logfile_number_); - } - loop_cfd->mem()->SetCreationSeq(versions_->LastSequence()); - } + InstrumentedMutexLock l(&mutex_); + snapshots_.Delete(casted_s); } - - cfd->mem()->SetNextLogNumber(logfile_number_); - cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_); - new_mem->Ref(); - cfd->SetMemtable(new_mem); - context->superversions_to_free_.push_back(InstallSuperVersionAndScheduleWork( - cfd, new_superversion, mutable_cf_options)); - return s; + delete casted_s; } #ifndef ROCKSDB_LITE @@ -5927,47 +2097,6 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { return s; } -// Default implementations of convenience methods that subclasses of DB -// can call if they wish -Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) { - // Pre-allocate size of write batch conservatively. - // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, - // and we allocate 11 extra bytes for key length, as well as value length. 
- WriteBatch batch(key.size() + value.size() + 24); - batch.Put(column_family, key, value); - return Write(opt, &batch); -} - -Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, - const Slice& key) { - WriteBatch batch; - batch.Delete(column_family, key); - return Write(opt, &batch); -} - -Status DB::SingleDelete(const WriteOptions& opt, - ColumnFamilyHandle* column_family, const Slice& key) { - WriteBatch batch; - batch.SingleDelete(column_family, key); - return Write(opt, &batch); -} - -Status DB::DeleteRange(const WriteOptions& opt, - ColumnFamilyHandle* column_family, - const Slice& begin_key, const Slice& end_key) { - WriteBatch batch; - batch.DeleteRange(column_family, begin_key, end_key); - return Write(opt, &batch); -} - -Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) { - WriteBatch batch; - batch.Merge(column_family, key, value); - return Write(opt, &batch); -} - // Default implementation -- returns not supported status Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, @@ -5984,219 +2113,6 @@ Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { DB::~DB() { } -Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { - DBOptions db_options(options); - ColumnFamilyOptions cf_options(options); - std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); - std::vector handles; - Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); - if (s.ok()) { - assert(handles.size() == 1); - // i can delete the handle since DBImpl is always holding a reference to - // default column family - delete handles[0]; - } - return s; -} - -Status DB::Open(const DBOptions& db_options, const std::string& dbname, - const std::vector& column_families, - std::vector* handles, DB** dbptr) { - Status s = SanitizeOptionsByTable(db_options, column_families); - if (!s.ok()) { - return s; - } - - s = ValidateOptions(db_options, column_families); - if (!s.ok()) { - return s; - } - - *dbptr = nullptr; - handles->clear(); - - size_t max_write_buffer_size = 0; - for (auto cf : column_families) { - max_write_buffer_size = - std::max(max_write_buffer_size, cf.options.write_buffer_size); - } - - DBImpl* impl = new DBImpl(db_options, dbname); - s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir); - if (s.ok()) { - for (auto db_path : impl->immutable_db_options_.db_paths) { - s = impl->env_->CreateDirIfMissing(db_path.path); - if (!s.ok()) { - break; - } - } - } - - if (!s.ok()) { - delete impl; - return s; - } - - s = impl->CreateArchivalDirectory(); - if (!s.ok()) { - delete impl; - return s; - } - impl->mutex_.Lock(); - // Handles create_if_missing, error_if_exists - s = impl->Recover(column_families); - if (s.ok()) { - uint64_t new_log_number = impl->versions_->NewFileNumber(); - unique_ptr lfile; - EnvOptions soptions(db_options); - EnvOptions opt_env_options = - impl->immutable_db_options_.env->OptimizeForLogWrite( - soptions, BuildDBOptions(impl->immutable_db_options_, - impl->mutable_db_options_)); - s = NewWritableFile( - impl->immutable_db_options_.env, - LogFileName(impl->immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_options); - if (s.ok()) { - lfile->SetPreallocationBlockSize( - impl->GetWalPreallocateBlockSize(max_write_buffer_size)); - impl->logfile_number_ = new_log_number; - unique_ptr 
file_writer( - new WritableFileWriter(std::move(lfile), opt_env_options)); - impl->logs_.emplace_back( - new_log_number, - new log::Writer( - std::move(file_writer), new_log_number, - impl->immutable_db_options_.recycle_log_file_num > 0)); - - // set column family handles - for (auto cf : column_families) { - auto cfd = - impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); - if (cfd != nullptr) { - handles->push_back( - new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); - impl->NewThreadStatusCfInfo(cfd); - } else { - if (db_options.create_missing_column_families) { - // missing column family, create it - ColumnFamilyHandle* handle; - impl->mutex_.Unlock(); - s = impl->CreateColumnFamily(cf.options, cf.name, &handle); - impl->mutex_.Lock(); - if (s.ok()) { - handles->push_back(handle); - } else { - break; - } - } else { - s = Status::InvalidArgument("Column family not found: ", cf.name); - break; - } - } - } - } - if (s.ok()) { - for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - delete impl->InstallSuperVersionAndScheduleWork( - cfd, nullptr, *cfd->GetLatestMutableCFOptions()); - } - impl->alive_log_files_.push_back( - DBImpl::LogFileNumberSize(impl->logfile_number_)); - impl->DeleteObsoleteFiles(); - s = impl->directories_.GetDbDir()->Fsync(); - } - } - - if (s.ok()) { - for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - auto* vstorage = cfd->current()->storage_info(); - for (int i = 1; i < vstorage->num_levels(); ++i) { - int num_files = vstorage->NumLevelFiles(i); - if (num_files > 0) { - s = Status::InvalidArgument( - "Not all files are at level 0. Cannot " - "open with FIFO compaction style."); - break; - } - } - } - if (!cfd->mem()->IsSnapshotSupported()) { - impl->is_snapshot_supported_ = false; - } - if (cfd->ioptions()->merge_operator != nullptr && - !cfd->mem()->IsMergeOperatorSupported()) { - s = Status::InvalidArgument( - "The memtable of column family %s does not support merge operator " - "its options.merge_operator is non-null", cfd->GetName().c_str()); - } - if (!s.ok()) { - break; - } - } - } - TEST_SYNC_POINT("DBImpl::Open:Opened"); - Status persist_options_status; - if (s.ok()) { - // Persist RocksDB Options before scheduling the compaction. - // The WriteOptionsFile() will release and lock the mutex internally. - persist_options_status = impl->WriteOptionsFile(); - - *dbptr = impl; - impl->opened_successfully_ = true; - impl->MaybeScheduleFlushOrCompaction(); - } - impl->mutex_.Unlock(); - -#ifndef ROCKSDB_LITE - auto sfm = static_cast( - impl->immutable_db_options_.sst_file_manager.get()); - if (s.ok() && sfm) { - // Notify SstFileManager about all sst files that already exist in - // db_paths[0] when the DB is opened. 
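[For orientation while reading the removal of the Open() overloads: from the caller's side, the single-Options overload wraps everything in the "default" column family, while the multi-CF overload returns one handle per descriptor and is what exercises the create_missing_column_families branch above. A usage sketch against the public API:]

#include <string>
#include <vector>
#include "rocksdb/db.h"

rocksdb::Status OpenWithColumnFamilies(
    const std::string& path, rocksdb::DB** db,
    std::vector<rocksdb::ColumnFamilyHandle*>* handles) {
  rocksdb::DBOptions db_opts;
  db_opts.create_if_missing = true;
  db_opts.create_missing_column_families = true;
  std::vector<rocksdb::ColumnFamilyDescriptor> cfs;
  cfs.emplace_back(rocksdb::kDefaultColumnFamilyName,
                   rocksdb::ColumnFamilyOptions());
  cfs.emplace_back("meta", rocksdb::ColumnFamilyOptions());
  return rocksdb::DB::Open(db_opts, path, cfs, handles, db);
}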
- auto& db_path = impl->immutable_db_options_.db_paths[0]; - std::vector existing_files; - impl->immutable_db_options_.env->GetChildren(db_path.path, &existing_files); - for (auto& file_name : existing_files) { - uint64_t file_number; - FileType file_type; - std::string file_path = db_path.path + "/" + file_name; - if (ParseFileName(file_name, &file_number, &file_type) && - file_type == kTableFile) { - sfm->OnAddFile(file_path); - } - } - } -#endif // !ROCKSDB_LITE - - if (s.ok()) { - ROCKS_LOG_INFO(impl->immutable_db_options_.info_log, "DB pointer %p", impl); - LogFlush(impl->immutable_db_options_.info_log); - if (!persist_options_status.ok()) { - if (db_options.fail_if_options_file_error) { - s = Status::IOError( - "DB::Open() failed --- Unable to persist Options file", - persist_options_status.ToString()); - } - ROCKS_LOG_WARN(impl->immutable_db_options_.info_log, - "Unable to persist options in DB::Open() -- %s", - persist_options_status.ToString().c_str()); - } - } - if (!s.ok()) { - for (auto* h : *handles) { - delete h; - } - handles->clear(); - delete impl; - *dbptr = nullptr; - } - return s; -} - Status DB::ListColumnFamilies(const DBOptions& db_options, const std::string& name, std::vector* column_families) { diff --git a/db/db_impl.h b/db/db_impl.h index 2be81374e..84f0ce0c6 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -38,6 +38,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/status.h" #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" @@ -1149,6 +1150,10 @@ extern Options SanitizeOptions(const std::string& db, extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); +extern CompressionType GetCompressionFlush( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + // Fix user-supplied options to be reasonable template static void ClipToRange(T* ptr, V minvalue, V maxvalue) { diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc new file mode 100644 index 000000000..6cf004a1a --- /dev/null +++ b/db/db_impl_compaction_flush.cc @@ -0,0 +1,1731 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/db_impl.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include + +#include "db/builder.h" +#include "util/iostats_context_imp.h" +#include "util/perf_context_imp.h" +#include "util/sst_file_manager_impl.h" +#include "util/sync_point.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" + +namespace rocksdb { +Status DBImpl::SyncClosedLogs(JobContext* job_context) { + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); + mutex_.AssertHeld(); + autovector logs_to_sync; + uint64_t current_log_number = logfile_number_; + while (logs_.front().number < current_log_number && + logs_.front().getting_synced) { + log_sync_cv_.Wait(); + } + for (auto it = logs_.begin(); + it != logs_.end() && it->number < current_log_number; ++it) { + auto& log = *it; + assert(!log.getting_synced); + log.getting_synced = true; + logs_to_sync.push_back(log.writer); + } + + Status s; + if (!logs_to_sync.empty()) { + mutex_.Unlock(); + + for (log::Writer* log : logs_to_sync) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, + log->get_log_number()); + s = log->file()->Sync(immutable_db_options_.use_fsync); + } + if (s.ok()) { + s = directories_.GetWalDir()->Fsync(); + } + + mutex_.Lock(); + + // "number <= current_log_number - 1" is equivalent to + // "number < current_log_number". + MarkLogsSynced(current_log_number - 1, true, s); + if (!s.ok()) { + bg_error_ = s; + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); + return s; + } + } + return s; +} + +Status DBImpl::FlushMemTableToOutputFile( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) { + mutex_.AssertHeld(); + assert(cfd->imm()->NumNotFlushed() != 0); + assert(cfd->imm()->IsFlushPending()); + + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + + FlushJob flush_job( + dbname_, cfd, immutable_db_options_, mutable_cf_options, env_options_, + versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, + earliest_write_conflict_snapshot, job_context, log_buffer, + directories_.GetDbDir(), directories_.GetDataDir(0U), + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, + &event_logger_, mutable_cf_options.report_bg_io_stats); + + FileMetaData file_meta; + + flush_job.PickMemTable(); + + Status s; + if (logfile_number_ > 0 && + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 0) { + // If there are more than one column families, we need to make sure that + // all the log files except the most recent one are synced. Otherwise if + // the host crashes after flushing and before WAL is persistent, the + // flushed SST may contain data from write batches whose updates to + // other column families are missing. + // SyncClosedLogs() may unlock and re-lock the db_mutex. + s = SyncClosedLogs(job_context); + } + + // Within flush_job.Run, rocksdb may call event listener to notify + // file creation and deletion. + // + // Note that flush_job.Run will unlock and lock the db_mutex, + // and EventListener callback will be called when the db_mutex + // is unlocked by the current thread. 
+ if (s.ok()) { + s = flush_job.Run(&file_meta); + } else { + flush_job.Cancel(); + } + + if (s.ok()) { + InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context, + mutable_cf_options); + if (made_progress) { + *made_progress = 1; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + + if (!s.ok() && !s.IsShutdownInProgress() && + immutable_db_options_.paranoid_checks && bg_error_.ok()) { + // if a bad error happened (not ShutdownInProgress) and paranoid_checks is + // true, mark DB read-only + bg_error_ = s; + } + if (s.ok()) { +#ifndef ROCKSDB_LITE + // may temporarily unlock and lock the mutex. + NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, + job_context->job_id, flush_job.GetTableProperties()); + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm) { + // Notify sst_file_manager that a new file was added + std::string file_path = MakeTableFileName( + immutable_db_options_.db_paths[0].path, file_meta.fd.GetNumber()); + sfm->OnAddFile(file_path); + if (sfm->IsMaxAllowedSpaceReached() && bg_error_.ok()) { + bg_error_ = Status::IOError("Max allowed space was reached"); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", + &bg_error_); + } + } +#endif // ROCKSDB_LITE + } + return s; +} + +void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, + FileMetaData* file_meta, + const MutableCFOptions& mutable_cf_options, + int job_id, TableProperties prop) { +#ifndef ROCKSDB_LITE + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + bool triggered_writes_slowdown = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_slowdown_writes_trigger); + bool triggered_writes_stop = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_stop_writes_trigger); + // release lock while notifying events + mutex_.Unlock(); + { + FlushJobInfo info; + info.cf_name = cfd->GetName(); + // TODO(yhchiang): make db_paths dynamic in case flush does not + // go to L0 in the future. + info.file_path = MakeTableFileName(immutable_db_options_.db_paths[0].path, + file_meta->fd.GetNumber()); + info.thread_id = env_->GetThreadID(); + info.job_id = job_id; + info.triggered_writes_slowdown = triggered_writes_slowdown; + info.triggered_writes_stop = triggered_writes_stop; + info.smallest_seqno = file_meta->smallest_seqno; + info.largest_seqno = file_meta->largest_seqno; + info.table_properties = prop; + for (auto listener : immutable_db_options_.listeners) { + listener->OnFlushCompleted(this, info); + } + } + mutex_.Lock(); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. 
+#endif // ROCKSDB_LITE +} + +Status DBImpl::CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + if (options.target_path_id >= immutable_db_options_.db_paths.size()) { + return Status::InvalidArgument("Invalid target path ID"); + } + + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + bool exclusive = options.exclusive_manual_compaction; + + Status s = FlushMemTable(cfd, FlushOptions()); + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + + int max_level_with_files = 0; + { + InstrumentedMutexLock l(&mutex_); + Version* base = cfd->current(); + for (int level = 1; level < base->storage_info()->num_non_empty_levels(); + level++) { + if (base->storage_info()->OverlapInLevel(level, begin, end)) { + max_level_with_files = level; + } + } + } + + int final_output_level = 0; + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal && + cfd->NumberLevels() > 1) { + // Always compact all files together. + s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, + cfd->NumberLevels() - 1, options.target_path_id, + begin, end, exclusive); + final_output_level = cfd->NumberLevels() - 1; + } else { + for (int level = 0; level <= max_level_with_files; level++) { + int output_level; + // in case the compaction is universal or if we're compacting the + // bottom-most level, the output level will be the same as input one. + // level 0 can never be the bottommost level (i.e. if all files are in + // level 0, we will compact to level 1) + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + output_level = level; + } else if (level == max_level_with_files && level > 0) { + if (options.bottommost_level_compaction == + BottommostLevelCompaction::kSkip) { + // Skip bottommost level compaction + continue; + } else if (options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter && + cfd->ioptions()->compaction_filter == nullptr && + cfd->ioptions()->compaction_filter_factory == nullptr) { + // Skip bottommost level compaction since we don't have a compaction + // filter + continue; + } + output_level = level; + } else { + output_level = level + 1; + if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && + cfd->ioptions()->level_compaction_dynamic_level_bytes && + level == 0) { + output_level = ColumnFamilyData::kCompactToBaseLevel; + } + } + s = RunManualCompaction(cfd, level, output_level, options.target_path_id, + begin, end, exclusive); + if (!s.ok()) { + break; + } + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + final_output_level = cfd->NumberLevels() - 1; + } else if (output_level > final_output_level) { + final_output_level = output_level; + } + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); + } + } + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + + if (options.change_level) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[RefitLevel] waiting for background threads to stop"); + s = PauseBackgroundWork(); + if (s.ok()) { + s = ReFitLevel(cfd, final_output_level, options.target_level); + } + ContinueBackgroundWork(); + } + LogFlush(immutable_db_options_.info_log); + + { + InstrumentedMutexLock l(&mutex_); + // an automatic compaction that has been scheduled might have been + // preempted by the manual compactions. 
Need to schedule it back. + MaybeScheduleFlushOrCompaction(); + } + + return s; +} + +Status DBImpl::CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id) { +#ifdef ROCKSDB_LITE + // not supported in lite version + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + if (column_family == nullptr) { + return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); + } + + auto cfd = reinterpret_cast(column_family)->cfd(); + assert(cfd); + + Status s; + JobContext job_context(0, true); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + // Perform CompactFiles + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + { + InstrumentedMutexLock l(&mutex_); + + // This call will unlock/lock the mutex to wait for current running + // IngestExternalFile() calls to finish. + WaitForIngestFile(); + + s = CompactFilesImpl(compact_options, cfd, sv->current, + input_file_names, output_level, + output_path_id, &job_context, &log_buffer); + } + if (sv->Unref()) { + mutex_.Lock(); + sv->Cleanup(); + mutex_.Unlock(); + delete sv; + } + + // Find and delete obsolete files + { + InstrumentedMutexLock l(&mutex_); + // If !s.ok(), this means that Compaction failed. In that case, we want + // to delete all obsolete files we might have created and we force + // FindObsoleteFiles(). This is because job_context does not + // catch all created files if compaction failed. + FindObsoleteFiles(&job_context, !s.ok()); + } // release the mutex + + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + // Have to flush the info logs before bg_compaction_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the destructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to accessing other states that DB owns. + log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + // no mutex is locked here. No need to Unlock() and Lock() here. + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + } + + return s; +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +Status DBImpl::CompactFilesImpl( + const CompactionOptions& compact_options, ColumnFamilyData* cfd, + Version* version, const std::vector& input_file_names, + const int output_level, int output_path_id, JobContext* job_context, + LogBuffer* log_buffer) { + mutex_.AssertHeld(); + + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + std::unordered_set input_set; + for (auto file_name : input_file_names) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + ColumnFamilyMetaData cf_meta; + // TODO(yhchiang): can directly use version here if none of the + following functions call is pluggable to external developers.
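[For orientation, the public entry point that drives CompactRange() above, and its change_level branch into ReFitLevel() later in this file, looks like this from the caller's side:]

#include "rocksdb/db.h"

rocksdb::Status FullCompact(rocksdb::DB* db,
                            rocksdb::ColumnFamilyHandle* cf) {
  rocksdb::CompactRangeOptions opts;
  opts.change_level = true;  // afterwards, move output down via ReFitLevel()
  opts.target_level = -1;    // -1: pick the minimum level the files fit into
  // nullptr begin/end means the whole key range.
  return db->CompactRange(opts, cf, nullptr, nullptr);
}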
+ version->GetColumnFamilyMetaData(&cf_meta); + + if (output_path_id < 0) { + if (immutable_db_options_.db_paths.size() == 1U) { + output_path_id = 0; + } else { + return Status::NotSupported( + "Automatic output path selection is not " + "yet supported in CompactFiles()"); + } + } + + Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( + &input_set, cf_meta, output_level); + if (!s.ok()) { + return s; + } + + std::vector input_files; + s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, version->storage_info(), compact_options); + if (!s.ok()) { + return s; + } + + for (auto inputs : input_files) { + if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) { + return Status::Aborted( + "Some of the necessary compaction input " + "files are already being compacted"); + } + } + + // At this point, CompactFiles will be run. + bg_compaction_scheduled_++; + + unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->FormCompaction( + compact_options, input_files, output_level, version->storage_info(), + *cfd->GetLatestMutableCFOptions(), output_path_id)); + if (!c) { + return Status::Aborted("Another Level 0 compaction is running"); + } + c->SetInputVersion(version); + // deletion compaction currently not allowed in CompactFiles. + assert(!c->deletion_compaction()); + + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + + auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); + + assert(is_snapshot_supported_ || snapshots_.empty()); + CompactionJob compaction_job( + job_context->job_id, c.get(), immutable_db_options_, env_options_, + versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), + directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, &bg_error_, + snapshot_seqs, earliest_write_conflict_snapshot, table_cache_, + &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + c->mutable_cf_options()->report_bg_io_stats, dbname_, + nullptr); // Here we pass a nullptr for CompactionJobStats because + // CompactFiles does not trigger OnCompactionCompleted(), + // which is the only place where CompactionJobStats is + // returned. The idea of not triggering OnCompactionCompleted() + // is that CompactFiles runs in the caller thread, so the user + // should always know when it completes. As a result, it makes + // less sense to notify the users of something they should already + // know. + // + // In the future, if we would like to add CompactionJobStats + // support for CompactFiles, we should have the CompactFiles API + // pass a pointer to CompactionJobStats as the out-value + // instead of using EventListener. + + // Creating a compaction influences the compaction score because the score + takes running compactions into account (by skipping files that are already + being compacted). Since we just changed compaction score, we recalculate it + here.
+ version->storage_info()->ComputeCompactionScore(*cfd->ioptions(), + *c->mutable_cf_options()); + + compaction_job.Prepare(); + + mutex_.Unlock(); + TEST_SYNC_POINT("CompactFilesImpl:0"); + TEST_SYNC_POINT("CompactFilesImpl:1"); + compaction_job.Run(); + TEST_SYNC_POINT("CompactFilesImpl:2"); + TEST_SYNC_POINT("CompactFilesImpl:3"); + mutex_.Lock(); + + Status status = compaction_job.Install(*c->mutable_cf_options()); + if (status.ok()) { + InstallSuperVersionAndScheduleWorkWrapper( + c->column_family_data(), job_context, *c->mutable_cf_options()); + } + c->ReleaseCompactionFiles(s); + + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + if (status.ok()) { + // Done + } else if (status.IsShutdownInProgress()) { + // Ignore compaction errors found during shutting down + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] [JOB %d] Compaction error: %s", + c->column_family_data()->GetName().c_str(), + job_context->job_id, status.ToString().c_str()); + if (immutable_db_options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } + } + + c.reset(); + + bg_compaction_scheduled_--; + if (bg_compaction_scheduled_ == 0) { + bg_cv_.SignalAll(); + } + + return status; +} +#endif // ROCKSDB_LITE + +Status DBImpl::PauseBackgroundWork() { + InstrumentedMutexLock guard_lock(&mutex_); + bg_compaction_paused_++; + while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) { + bg_cv_.Wait(); + } + bg_work_paused_++; + return Status::OK(); +} + +Status DBImpl::ContinueBackgroundWork() { + InstrumentedMutexLock guard_lock(&mutex_); + if (bg_work_paused_ == 0) { + return Status::InvalidArgument(); + } + assert(bg_work_paused_ > 0); + assert(bg_compaction_paused_ > 0); + bg_compaction_paused_--; + bg_work_paused_--; + // It's sufficient to check just bg_work_paused_ here since + // bg_work_paused_ is always no greater than bg_compaction_paused_ + if (bg_work_paused_ == 0) { + MaybeScheduleFlushOrCompaction(); + } + return Status::OK(); +} + +void DBImpl::NotifyOnCompactionCompleted( + ColumnFamilyData* cfd, Compaction *c, const Status &st, + const CompactionJobStats& compaction_job_stats, + const int job_id) { +#ifndef ROCKSDB_LITE + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + // release lock while notifying events + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); + { + CompactionJobInfo info; + info.cf_name = cfd->GetName(); + info.status = st; + info.thread_id = env_->GetThreadID(); + info.job_id = job_id; + info.base_input_level = c->start_level(); + info.output_level = c->output_level(); + info.stats = compaction_job_stats; + info.table_properties = c->GetOutputTableProperties(); + info.compaction_reason = c->compaction_reason(); + info.compression = c->output_compression(); + for (size_t i = 0; i < c->num_input_levels(); ++i) { + for (const auto fmd : *c->inputs(i)) { + auto fn = TableFileName(immutable_db_options_.db_paths, + fmd->fd.GetNumber(), fmd->fd.GetPathId()); + info.input_files.push_back(fn); + if (info.table_properties.count(fn) == 0) { + std::shared_ptr tp; + auto s = cfd->current()->GetTableProperties(&tp, fmd, &fn); + if (s.ok()) { + info.table_properties[fn] = tp; + } + } + } + } + for (const auto newf : c->edit()->GetNewFiles()) { + info.output_files.push_back(TableFileName(immutable_db_options_.db_paths, + newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); + } + for (auto 
listener : immutable_db_options_.listeners) { + listener->OnCompactionCompleted(this, info); + } + } + mutex_.Lock(); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. +#endif // ROCKSDB_LITE +} + +// REQUIREMENT: block all background work by calling PauseBackgroundWork() +// before calling this function +Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { + assert(level < cfd->NumberLevels()); + if (target_level >= cfd->NumberLevels()) { + return Status::InvalidArgument("Target level exceeds number of levels"); + } + + std::unique_ptr superversion_to_free; + std::unique_ptr new_superversion(new SuperVersion()); + + Status status; + + InstrumentedMutexLock guard_lock(&mutex_); + + // only allow one thread refitting + if (refitting_level_) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[ReFitLevel] another thread is refitting"); + return Status::NotSupported("another thread is refitting"); + } + refitting_level_ = true; + + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + // move to a smaller level + int to_level = target_level; + if (target_level < 0) { + to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); + } + + auto* vstorage = cfd->current()->storage_info(); + if (to_level > level) { + if (level == 0) { + return Status::NotSupported( + "Cannot change from level 0 to other levels."); + } + // Check levels are empty for a trivial move + for (int l = level + 1; l <= to_level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } + } + } + if (to_level != level) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Before refitting:\n%s", cfd->GetName().c_str(), + cfd->current()->DebugString().data()); + + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + for (const auto& f : vstorage->LevelFiles(level)) { + edit.DeleteFile(level, f->fd.GetNumber()); + edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno, + f->marked_for_compaction); + } + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), + edit.DebugString().data()); + + status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, + directories_.GetDbDir()); + superversion_to_free.reset(InstallSuperVersionAndScheduleWork( + cfd, new_superversion.release(), mutable_cf_options)); + + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", + cfd->GetName().c_str(), status.ToString().data()); + + if (status.ok()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] After refitting:\n%s", cfd->GetName().c_str(), + cfd->current()->DebugString().data()); + } + } + + refitting_level_ = false; + + return status; +} + +int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + return cfh->cfd()->NumberLevels(); +} + +int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { + return 0; +} + +int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + InstrumentedMutexLock l(&mutex_); + return cfh->cfd()->GetSuperVersion()-> + mutable_cf_options.level0_stop_writes_trigger; +} + +Status DBImpl::Flush(const FlushOptions& flush_options, + ColumnFamilyHandle* column_family) { + auto cfh = 
reinterpret_cast(column_family); + return FlushMemTable(cfh->cfd(), flush_options); +} + +Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, + int output_level, uint32_t output_path_id, + const Slice* begin, const Slice* end, + bool exclusive, bool disallow_trivial_move) { + assert(input_level == ColumnFamilyData::kCompactAllLevels || + input_level >= 0); + + InternalKey begin_storage, end_storage; + CompactionArg* ca; + + bool scheduled = false; + bool manual_conflict = false; + ManualCompaction manual; + manual.cfd = cfd; + manual.input_level = input_level; + manual.output_level = output_level; + manual.output_path_id = output_path_id; + manual.done = false; + manual.in_progress = false; + manual.incomplete = false; + manual.exclusive = exclusive; + manual.disallow_trivial_move = disallow_trivial_move; + // For universal compaction, we enforce every manual compaction to compact + // all files. + if (begin == nullptr || + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + manual.begin = nullptr; + } else { + begin_storage.SetMaxPossibleForUserKey(*begin); + manual.begin = &begin_storage; + } + if (end == nullptr || + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + manual.end = nullptr; + } else { + end_storage.SetMinPossibleForUserKey(*end); + manual.end = &end_storage; + } + + TEST_SYNC_POINT("DBImpl::RunManualCompaction:0"); + TEST_SYNC_POINT("DBImpl::RunManualCompaction:1"); + InstrumentedMutexLock l(&mutex_); + + // When a manual compaction arrives, temporarily disable scheduling of + // non-manual compactions and wait until the number of scheduled compaction + // jobs drops to zero. This is needed to ensure that this manual compaction + // can compact any range of keys/files. + // + // HasPendingManualCompaction() is true when at least one thread is inside + // RunManualCompaction(), i.e. during that time no other compaction will + // get scheduled (see MaybeScheduleFlushOrCompaction). + // + // Note that the following loop doesn't stop more than one thread calling + // RunManualCompaction() from getting to the second while loop below. + // However, only one of them will actually schedule compaction, while + // others will wait on a condition variable until it completes. + + AddManualCompaction(&manual); + TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); + if (exclusive) { + while (bg_compaction_scheduled_ > 0) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Manual compaction waiting for all other scheduled background " + "compactions to finish", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + } + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] Manual compaction starting", cfd->GetName().c_str()); + + // We don't check bg_error_ here, because if we get the error in compaction, + the compaction will set manual.status to bg_error_ and set manual.done to + true.
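[One aside before the drive loop below: the Flush() entry point defined just above has this caller-side shape; with FlushOptions::wait set (the default) it blocks until the memtable has been written out as an SST (see WaitForFlushMemTable() below):]

#include "rocksdb/db.h"

rocksdb::Status ForceFlush(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  rocksdb::FlushOptions fo;
  fo.wait = true;  // block until the flush finishes; false = fire-and-forget
  return db->Flush(fo, cf);
}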
+ while (!manual.done) { + assert(HasPendingManualCompaction()); + manual_conflict = false; + if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || + scheduled || + ((manual.manual_end = &manual.tmp_storage1) && ( + (manual.compaction = manual.cfd->CompactRange( + *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, + manual.output_level, manual.output_path_id, manual.begin, + manual.end, &manual.manual_end, &manual_conflict)) == + nullptr) && + manual_conflict)) { + // exclusive manual compactions should not see a conflict during + // CompactRange + assert(!exclusive || !manual_conflict); + // Running either this or some other manual compaction + bg_cv_.Wait(); + if (scheduled && manual.incomplete == true) { + assert(!manual.in_progress); + scheduled = false; + manual.incomplete = false; + } + } else if (!scheduled) { + if (manual.compaction == nullptr) { + manual.done = true; + bg_cv_.SignalAll(); + continue; + } + ca = new CompactionArg; + ca->db = this; + ca->m = &manual; + manual.incomplete = false; + bg_compaction_scheduled_++; + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, + &DBImpl::UnscheduleCallback); + scheduled = true; + } + } + + assert(!manual.in_progress); + assert(HasPendingManualCompaction()); + RemoveManualCompaction(&manual); + bg_cv_.SignalAll(); + return manual.status; +} + +Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_options, + bool writes_stopped) { + Status s; + { + WriteContext context; + InstrumentedMutexLock guard_lock(&mutex_); + + if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty()) { + // Nothing to flush + return Status::OK(); + } + + WriteThread::Writer w; + if (!writes_stopped) { + write_thread_.EnterUnbatched(&w, &mutex_); + } + + // SwitchMemtable() will release and reacquire mutex + // during execution + s = SwitchMemtable(cfd, &context); + + if (!writes_stopped) { + write_thread_.ExitUnbatched(&w); + } + + cfd->imm()->FlushRequested(); + + // schedule flush + SchedulePendingFlush(cfd); + MaybeScheduleFlushOrCompaction(); + } + + if (s.ok() && flush_options.wait) { + // Wait until the flush completes + s = WaitForFlushMemTable(cfd); + } + return s; +} + +Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { + Status s; + // Wait until the flush completes + InstrumentedMutexLock l(&mutex_); + while (cfd->imm()->NumNotFlushed() > 0 && bg_error_.ok()) { + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + if (cfd->IsDropped()) { + // FlushJob cannot flush a dropped CF; if we did not break here + // we will loop forever since cfd->imm()->NumNotFlushed() will never + // drop to zero + return Status::InvalidArgument("Cannot flush a dropped CF"); + } + bg_cv_.Wait(); + } + if (!bg_error_.ok()) { + s = bg_error_; + } + return s; +} + +Status DBImpl::EnableAutoCompaction( + const std::vector& column_family_handles) { + Status s; + for (auto cf_ptr : column_family_handles) { + Status status = + this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}); + if (!status.ok()) { + s = status; + } + } + + return s; +} + +void DBImpl::MaybeScheduleFlushOrCompaction() { + mutex_.AssertHeld(); + if (!opened_successfully_) { + // Compaction may introduce data race to DB open + return; + } + if (bg_work_paused_ > 0) { + // we paused the background work + return; + } else if (shutting_down_.load(std::memory_order_acquire)) { + // DB is being deleted; no more background compactions + return; + } + + while
+  while (unscheduled_flushes_ > 0 &&
+         bg_flush_scheduled_ < immutable_db_options_.max_background_flushes) {
+    unscheduled_flushes_--;
+    bg_flush_scheduled_++;
+    env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this);
+  }
+
+  auto bg_compactions_allowed = BGCompactionsAllowed();
+
+  // special case -- if max_background_flushes == 0, then schedule flush on a
+  // compaction thread
+  if (immutable_db_options_.max_background_flushes == 0) {
+    while (unscheduled_flushes_ > 0 &&
+           bg_flush_scheduled_ + bg_compaction_scheduled_ <
+               bg_compactions_allowed) {
+      unscheduled_flushes_--;
+      bg_flush_scheduled_++;
+      env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this);
+    }
+  }
+
+  if (bg_compaction_paused_ > 0) {
+    // we paused the background compaction
+    return;
+  }
+
+  if (HasExclusiveManualCompaction()) {
+    // only manual compactions are allowed to run. don't schedule automatic
+    // compactions
+    return;
+  }
+
+  while (bg_compaction_scheduled_ < bg_compactions_allowed &&
+         unscheduled_compactions_ > 0) {
+    CompactionArg* ca = new CompactionArg;
+    ca->db = this;
+    ca->m = nullptr;
+    bg_compaction_scheduled_++;
+    unscheduled_compactions_--;
+    env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+                   &DBImpl::UnscheduleCallback);
+  }
+}
+
+int DBImpl::BGCompactionsAllowed() const {
+  mutex_.AssertHeld();
+  if (write_controller_.NeedSpeedupCompaction()) {
+    return mutable_db_options_.max_background_compactions;
+  } else {
+    return mutable_db_options_.base_background_compactions;
+  }
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+  assert(!cfd->pending_compaction());
+  cfd->Ref();
+  compaction_queue_.push_back(cfd);
+  cfd->set_pending_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+  assert(!compaction_queue_.empty());
+  auto cfd = *compaction_queue_.begin();
+  compaction_queue_.pop_front();
+  assert(cfd->pending_compaction());
+  cfd->set_pending_compaction(false);
+  return cfd;
+}
+
+void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) {
+  assert(!cfd->pending_flush());
+  cfd->Ref();
+  flush_queue_.push_back(cfd);
+  cfd->set_pending_flush(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() {
+  assert(!flush_queue_.empty());
+  auto cfd = *flush_queue_.begin();
+  flush_queue_.pop_front();
+  assert(cfd->pending_flush());
+  cfd->set_pending_flush(false);
+  return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) {
+  if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) {
+    AddToFlushQueue(cfd);
+    ++unscheduled_flushes_;
+  }
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+  if (!cfd->pending_compaction() && cfd->NeedsCompaction()) {
+    AddToCompactionQueue(cfd);
+    ++unscheduled_compactions_;
+  }
+}
+
+void DBImpl::SchedulePendingPurge(std::string fname, FileType type,
+                                  uint64_t number, uint32_t path_id,
+                                  int job_id) {
+  mutex_.AssertHeld();
+  PurgeFileInfo file_info(fname, type, number, path_id, job_id);
+  purge_queue_.push_back(std::move(file_info));
+}
+
+void DBImpl::BGWorkFlush(void* db) {
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+  reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+  CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+  delete reinterpret_cast<CompactionArg*>(arg);
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+  TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+  reinterpret_cast<DBImpl*>(ca.db)->BackgroundCallCompaction(ca.m);
+}
+
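[Editor's note] BGWorkFlush and BGWorkCompaction above use the usual C-style trampoline: Env::Schedule only accepts a plain void (*)(void*) callback, so a static function recovers the typed object from the opaque pointer and dispatches to a member function. A minimal self-contained sketch of the same pattern; the ThreadPool and Worker names here are hypothetical, not RocksDB types:

    #include <cassert>

    // Stand-in for Env::Schedule; runs the callback inline for illustration.
    struct ThreadPool {
      void Schedule(void (*fn)(void*), void* arg) { fn(arg); }
    };

    class Worker {
     public:
      void Start(ThreadPool* pool) {
        // Smuggle `this` through the void* channel, as DBImpl does.
        pool->Schedule(&Worker::Trampoline, this);
      }
      int runs = 0;

     private:
      static void Trampoline(void* arg) {
        assert(arg != nullptr);
        static_cast<Worker*>(arg)->DoWork();  // recover the instance
      }
      void DoWork() { ++runs; }
    };

Note that BGWorkCompaction additionally receives a heap-allocated CompactionArg so the argument outlives the scheduling call; UnscheduleCallback below frees it (and any picked compaction) when a queued job is dropped.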
+void DBImpl::BGWorkPurge(void* db) {
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+  reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCallback(void* arg) {
+  CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+  delete reinterpret_cast<CompactionArg*>(arg);
+  if ((ca.m != nullptr) && (ca.m->compaction != nullptr)) {
+    delete ca.m->compaction;
+  }
+  TEST_SYNC_POINT("DBImpl::UnscheduleCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+                               LogBuffer* log_buffer) {
+  mutex_.AssertHeld();
+
+  Status status = bg_error_;
+  if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
+    status = Status::ShutdownInProgress();
+  }
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  ColumnFamilyData* cfd = nullptr;
+  while (!flush_queue_.empty()) {
+    // This cfd is already referenced
+    auto first_cfd = PopFirstFromFlushQueue();
+
+    if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) {
+      // can't flush this CF, try next one
+      if (first_cfd->Unref()) {
+        delete first_cfd;
+      }
+      continue;
+    }
+
+    // found a flush!
+    cfd = first_cfd;
+    break;
+  }
+
+  if (cfd != nullptr) {
+    const MutableCFOptions mutable_cf_options =
+        *cfd->GetLatestMutableCFOptions();
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "Calling FlushMemTableToOutputFile with column "
+        "family [%s], flush slots available %d, compaction slots allowed %d, "
+        "compaction slots scheduled %d",
+        cfd->GetName().c_str(), immutable_db_options_.max_background_flushes,
+        bg_flush_scheduled_, BGCompactionsAllowed() - bg_compaction_scheduled_);
+    status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress,
+                                       job_context, log_buffer);
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
+  return status;
+}
+
+void DBImpl::BackgroundCallFlush() {
+  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  assert(bg_flush_scheduled_);
+
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start");
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  {
+    InstrumentedMutexLock l(&mutex_);
+    num_running_flushes_++;
+
+    auto pending_outputs_inserted_elem =
+        CaptureCurrentFileNumberInPendingOutputs();
+
+    Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer);
+    if (!s.ok() && !s.IsShutdownInProgress()) {
+      // Wait a little bit before retrying background flush in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed flushes for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background flush error: %s, "
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      log_buffer.FlushBufferToLog();
+      LogFlush(immutable_db_options_.info_log);
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // If flush failed, we want to delete all temporary files that we might
+    // have created. Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      // Have to flush the info logs before bg_flush_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+
+    assert(num_running_flushes_ > 0);
+    num_running_flushes_--;
+    bg_flush_scheduled_--;
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
+
+void DBImpl::BackgroundCallCompaction(void* arg) {
+  bool made_progress = false;
+  ManualCompaction* m = reinterpret_cast<ManualCompaction*>(arg);
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  TEST_SYNC_POINT("BackgroundCallCompaction:0");
+  MaybeDumpStats();
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+    WaitForIngestFile();
+
+    num_running_compactions_++;
+
+    auto pending_outputs_inserted_elem =
+        CaptureCurrentFileNumberInPendingOutputs();
+
+    assert(bg_compaction_scheduled_);
+    Status s =
+        BackgroundCompaction(&made_progress, &job_context, &log_buffer, m);
+    TEST_SYNC_POINT("BackgroundCallCompaction:1");
+    if (!s.ok() && !s.IsShutdownInProgress()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      log_buffer.FlushBufferToLog();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background compaction error: %s, "
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      LogFlush(immutable_db_options_.info_log);
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // If compaction failed, we want to delete all temporary files that we
+    // might have created (they might not be all recorded in job_context in
+    // case of a failure). Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
+
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_compaction_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+
+    assert(num_running_compactions_ > 0);
+    num_running_compactions_--;
+    bg_compaction_scheduled_--;
+
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    if (made_progress || bg_compaction_scheduled_ == 0 ||
+        HasPendingManualCompaction()) {
+      // signal if
+      // * made_progress -- need to wakeup DelayWrite
+      // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+      // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
+      // If none of this is true, there is no need to signal since nobody is
+      // waiting for it
+      bg_cv_.SignalAll();
+    }
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
+
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+                                    JobContext* job_context,
+                                    LogBuffer* log_buffer, void* arg) {
+  ManualCompaction* manual_compaction =
+      reinterpret_cast<ManualCompaction*>(arg);
+  *made_progress = false;
+  mutex_.AssertHeld();
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+  bool is_manual = (manual_compaction != nullptr);
+
+  // (manual_compaction->in_progress == false);
+  bool trivial_move_disallowed =
+      is_manual && manual_compaction->disallow_trivial_move;
+
+  CompactionJobStats compaction_job_stats;
+  Status status = bg_error_;
+  if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
+    status = Status::ShutdownInProgress();
+  }
+
+  if (!status.ok()) {
+    if (is_manual) {
+      manual_compaction->status = status;
+      manual_compaction->done = true;
+      manual_compaction->in_progress = false;
+      delete manual_compaction->compaction;
+      manual_compaction = nullptr;
+    }
+    return status;
+  }
+
+  if (is_manual) {
+    // another thread cannot pick up the same work
+    manual_compaction->in_progress = true;
+  }
+
+  unique_ptr<Compaction> c;
+  // InternalKey manual_end_storage;
+  // InternalKey* manual_end = &manual_end_storage;
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction;
+    assert(m->in_progress);
+    c.reset(std::move(m->compaction));
+    if (!c) {
+      m->done = true;
+      m->manual_end = nullptr;
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] Manual compaction from level-%d from %s .. "
+                       "%s; nothing to do\n",
+                       m->cfd->GetName().c_str(), m->input_level,
+                       (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+                       (m->end ? m->end->DebugString().c_str() : "(end)"));
+    } else {
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Manual compaction from level-%d to level-%d from %s .. "
+          "%s; will stop at %s\n",
+          m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+          (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+          (m->end ? m->end->DebugString().c_str() : "(end)"),
+          ((m->done || m->manual_end == nullptr)
+               ? "(end)"
+               : m->manual_end->DebugString().c_str()));
+    }
+  } else if (!compaction_queue_.empty()) {
+    // cfd is referenced here
+    auto cfd = PopFirstFromCompactionQueue();
+    // We unreference here because the following code will take a Ref() on
+    // this cfd if it is going to use it (Compaction class holds a
+    // reference).
+    // This will all happen under a mutex so we don't have to be afraid of
+    // somebody else deleting it.
+    if (cfd->Unref()) {
+      delete cfd;
+      // This was the last reference of the column family, so no need to
+      // compact.
+      return Status::OK();
+    }
+
+    if (HaveManualCompaction(cfd)) {
+      // Can't compact right now, but try again later
+      TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
+      return Status::OK();
+    }
+
+    // Pick up the latest mutable CF Options and use it throughout the
+    // compaction job
+    // Compaction makes a copy of the latest MutableCFOptions. It should be
+    // used throughout the compaction procedure to ensure consistency. It will
+    // eventually be installed into SuperVersion
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+      // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+      // compaction is not necessary. Need to make sure mutex is held
+      // until we make a copy in the following code
+      TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
+      c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer));
+      TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+      if (c != nullptr) {
+        // update statistics
+        MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION,
+                    c->inputs(0)->size());
+        // There are three things that can change compaction score:
+        // 1) When a flush or compaction finishes. This case is covered by
+        //    InstallSuperVersionAndScheduleWork
+        // 2) When MutableCFOptions changes. This case is also covered by
+        //    InstallSuperVersionAndScheduleWork, because this is when the new
+        //    options take effect.
+        // 3) When we Pick a new compaction, we "remove" those files being
+        //    compacted from the calculation, which then influences compaction
+        //    score. Here we check if we need the new compaction even without
+        //    the files that are currently being compacted. If we need another
+        //    compaction, we might be able to execute it in parallel, so we add
+        //    it to the queue and schedule a new thread.
+        if (cfd->NeedsCompaction()) {
+          // Yes, we need more compactions!
+          AddToCompactionQueue(cfd);
+          ++unscheduled_compactions_;
+          MaybeScheduleFlushOrCompaction();
+        }
+      }
+    }
+  }
+
+  if (!c) {
+    // Nothing to do
+    ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+  } else if (c->deletion_compaction()) {
+    // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete an
+    // old file if there is an alive snapshot pointing to it
+    assert(c->num_input_files(1) == 0);
+    assert(c->level() == 0);
+    assert(c->column_family_data()->ioptions()->compaction_style ==
+           kCompactionStyleFIFO);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    for (const auto& f : *c->inputs(0)) {
+      c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+    }
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    InstallSuperVersionAndScheduleWorkWrapper(
+        c->column_family_data(), job_context, *c->mutable_cf_options());
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+                     c->column_family_data()->GetName().c_str(),
+                     c->num_input_files(0));
+    *made_progress = true;
+  } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+    // Instrument for event update
+    // TODO(yhchiang): add op details for showing trivial-move.
+    ThreadStatusUtil::SetColumnFamily(
+        c->column_family_data(), c->column_family_data()->ioptions()->env,
+        immutable_db_options_.enable_thread_tracking);
+    ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    // Move files to next level
+    int32_t moved_files = 0;
+    int64_t moved_bytes = 0;
+    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+      if (c->level(l) == c->output_level()) {
+        continue;
+      }
+      for (size_t i = 0; i < c->num_input_files(l); i++) {
+        FileMetaData* f = c->input(l, i);
+        c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+        c->edit()->AddFile(c->output_level(), f->fd.GetNumber(),
+                           f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
+                           f->largest, f->smallest_seqno, f->largest_seqno,
+                           f->marked_for_compaction);
+
+        ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving #%" PRIu64
+                                     " to level-%d %" PRIu64 " bytes\n",
+                         c->column_family_data()->GetName().c_str(),
+                         f->fd.GetNumber(), c->output_level(),
+                         f->fd.GetFileSize());
+        ++moved_files;
+        moved_bytes += f->fd.GetFileSize();
+      }
+    }
+
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    // Use latest MutableCFOptions
+    InstallSuperVersionAndScheduleWorkWrapper(
+        c->column_family_data(), job_context, *c->mutable_cf_options());
+
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+                                                             moved_bytes);
+    {
+      event_logger_.LogToBuffer(log_buffer)
+          << "job" << job_context->job_id << "event"
+          << "trivial_move"
+          << "destination_level" << c->output_level() << "files" << moved_files
+          << "total_files_size" << moved_bytes;
+    }
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+        c->column_family_data()->GetName().c_str(), moved_files,
+        c->output_level(), moved_bytes, status.ToString().c_str(),
+        c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+    *made_progress = true;
+
+    // Clear Instrument
+    ThreadStatusUtil::ResetThreadStatus();
+  } else {
+    int output_level __attribute__((unused)) = c->output_level();
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+                             &output_level);
+
+    SequenceNumber earliest_write_conflict_snapshot;
+    std::vector<SequenceNumber> snapshot_seqs =
+        snapshots_.GetAll(&earliest_write_conflict_snapshot);
+
+    assert(is_snapshot_supported_ || snapshots_.empty());
+    CompactionJob compaction_job(
+        job_context->job_id, c.get(), immutable_db_options_, env_options_,
+        versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(),
+        directories_.GetDataDir(c->output_path_id()), stats_, &mutex_,
+        &bg_error_, snapshot_seqs, earliest_write_conflict_snapshot,
+        table_cache_, &event_logger_,
+        c->mutable_cf_options()->paranoid_file_checks,
+        c->mutable_cf_options()->report_bg_io_stats, dbname_,
+        &compaction_job_stats);
+    compaction_job.Prepare();
+
+    mutex_.Unlock();
+    compaction_job.Run();
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+    mutex_.Lock();
+
+    status = compaction_job.Install(*c->mutable_cf_options());
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWorkWrapper(
+          c->column_family_data(), job_context, *c->mutable_cf_options());
+    }
+    *made_progress = true;
+  }
+  if (c != nullptr) {
+    c->ReleaseCompactionFiles(status);
+    *made_progress = true;
+    NotifyOnCompactionCompleted(
+        c->column_family_data(), c.get(), status,
+        compaction_job_stats, job_context->job_id);
+  }
+  // this will unref its input_version and column_family_data
+  c.reset();
+
+  if (status.ok()) {
+    // Done
+  } else if (status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+                   status.ToString().c_str());
+    if (immutable_db_options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction;
+    if (!status.ok()) {
+      m->status = status;
+      m->done = true;
+    }
+    // For universal compaction:
+    // Because universal compaction always happens at level 0, one
+    // compaction will pick up all overlapped files. No files will be
+    // filtered out due to size limit and left for a successive compaction.
+    // So we can safely conclude the current compaction.
+    //
+    // Also note that, if we don't stop here, then the current compaction
+    // writes a new file back to level 0, which will be used in successive
+    // compaction. Hence the manual compaction will never finish.
+    //
+    // Stop the compaction if manual_end points to nullptr -- this means
+    // that we compacted the whole range. manual_end should always point
+    // to nullptr in case of universal compaction
+    if (m->manual_end == nullptr) {
+      m->done = true;
+    }
+    if (!m->done) {
+      // We only compacted part of the requested range. Update *m
+      // to the range that is left to be compacted.
+      // Universal and FIFO compactions should always compact the whole range
+      assert(m->cfd->ioptions()->compaction_style !=
+                 kCompactionStyleUniversal ||
+             m->cfd->ioptions()->num_levels > 1);
+      assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+      m->tmp_storage = *m->manual_end;
+      m->begin = &m->tmp_storage;
+      m->incomplete = true;
+    }
+    m->in_progress = false;  // not being processed anymore
+  }
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+  return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+  return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompaction* m) {
+  manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) {
+  // Remove from queue
+  std::deque<ManualCompaction*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      it = manual_compaction_dequeue_.erase(it);
+      return;
+    }
+    it++;
+  }
+  assert(false);
+  return;
+}
+
+bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) {
+  if (num_running_ingest_file_ > 0) {
+    // We need to wait for other IngestExternalFile() calls to finish
+    // before running a manual compaction.
+    return true;
+  }
+  if (m->exclusive) {
+    return (bg_compaction_scheduled_ > 0);
+  }
+  std::deque<ManualCompaction*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  bool seen = false;
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      it++;
+      seen = true;
+      continue;
+    } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+      // The other manual compaction *it conflicts with m if it overlaps
+      // with m, is ahead of m in the queue, and is not yet in progress
+      return true;
+    }
+    it++;
+  }
+  return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+  // Scan the manual compaction queue
+  std::deque<ManualCompaction*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+      // A manual compaction for this column family is queued but not
+      // yet in progress
+      return true;
+    }
+    it++;
+  }
+  return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+  // Scan the manual compaction queue
+  std::deque<ManualCompaction*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    it++;
+  }
+  return false;
+}
+
+bool DBImpl::MCOverlap(ManualCompaction* m, ManualCompaction* m1) {
+  if ((m->exclusive) || (m1->exclusive)) {
+    return true;
+  }
+  if (m->cfd != m1->cfd) {
+    return false;
+  }
+  return true;
+}
+
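[Editor's note] ShouldntRunManualCompaction and MCOverlap above implement FIFO fairness between overlapping manual compactions: a request waits while any request queued ahead of it overlaps (same column family, or either one exclusive) and has not started yet. A simplified standalone model of that queue walk, using a hypothetical Req type rather than RocksDB's ManualCompaction:

    #include <deque>

    struct Req {
      int cf_id;        // column family the request targets
      bool exclusive;   // an exclusive request overlaps with everything
      bool in_progress;
    };

    // Mirrors MCOverlap: overlap if either request is exclusive or both
    // target the same column family.
    bool Overlap(const Req& a, const Req& b) {
      return a.exclusive || b.exclusive || a.cf_id == b.cf_id;
    }

    // Mirrors the deque scan: only entries ahead of `me` that overlap and
    // have not started yet force `me` to keep waiting.
    bool MustWait(const std::deque<const Req*>& queue, const Req* me) {
      for (const Req* other : queue) {
        if (other == me) break;  // entries behind `me` never conflict
        if (Overlap(*other, *me) && !other->in_progress) return true;
      }
      return false;
    }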
+// JobContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same job_context, we can't reuse the SuperVersion() that got malloced
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversion_to_free
+void DBImpl::InstallSuperVersionAndScheduleWorkWrapper(
+    ColumnFamilyData* cfd, JobContext* job_context,
+    const MutableCFOptions& mutable_cf_options) {
+  mutex_.AssertHeld();
+  SuperVersion* old_superversion = InstallSuperVersionAndScheduleWork(
+      cfd, job_context->new_superversion, mutable_cf_options);
+  job_context->new_superversion = nullptr;
+  job_context->superversions_to_free.push_back(old_superversion);
+}
+
+SuperVersion* DBImpl::InstallSuperVersionAndScheduleWork(
+    ColumnFamilyData* cfd, SuperVersion* new_sv,
+    const MutableCFOptions& mutable_cf_options) {
+  mutex_.AssertHeld();
+
+  // Update max_total_in_memory_state_
+  size_t old_memtable_size = 0;
+  auto* old_sv = cfd->GetSuperVersion();
+  if (old_sv) {
+    old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+                        old_sv->mutable_cf_options.max_write_buffer_number;
+  }
+
+  auto* old = cfd->InstallSuperVersion(
+      new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);
+
+  // Whenever we install a new SuperVersion, we might need to issue new
+  // flushes or compactions.
+  SchedulePendingFlush(cfd);
+  SchedulePendingCompaction(cfd);
+  MaybeScheduleFlushOrCompaction();
+
+  // Update max_total_in_memory_state_
+  max_total_in_memory_state_ =
+      max_total_in_memory_state_ - old_memtable_size +
+      mutable_cf_options.write_buffer_size *
+          mutable_cf_options.max_write_buffer_number;
+  return old;
+}
+}  // namespace rocksdb
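[Editor's note] InstallSuperVersionAndScheduleWork above maintains max_total_in_memory_state_ as a running sum of per-column-family write-buffer budgets (write_buffer_size times max_write_buffer_number), swapping the old contribution for the new one on every install. A tiny sketch of that accounting identity, with a hypothetical WriteBufferBudget type:

    #include <cstddef>

    struct WriteBufferBudget {
      std::size_t write_buffer_size;
      std::size_t max_write_buffer_number;
      std::size_t bytes() const {
        return write_buffer_size * max_write_buffer_number;
      }
    };

    // total' = total - old contribution + new contribution
    std::size_t SwapContribution(std::size_t total,
                                 const WriteBufferBudget& old_budget,
                                 const WriteBufferBudget& new_budget) {
      return total - old_budget.bytes() + new_budget.bytes();
    }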
+#include "db/db_impl.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include +#include "db/event_helpers.h" +#include "util/file_util.h" +#include "util/sst_file_manager_impl.h" + + +namespace rocksdb { +uint64_t DBImpl::FindMinPrepLogReferencedByMemTable() { + if (!allow_2pc()) { + return 0; + } + + uint64_t min_log = 0; + + // we must look through the memtables for two phase transactions + // that have been committed but not yet flushed + for (auto loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd->IsDropped()) { + continue; + } + + auto log = loop_cfd->imm()->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + + log = loop_cfd->mem()->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +void DBImpl::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { + assert(log != 0); + std::lock_guard lock(prep_heap_mutex_); + auto it = prepared_section_completed_.find(log); + assert(it != prepared_section_completed_.end()); + it->second += 1; +} + +void DBImpl::MarkLogAsContainingPrepSection(uint64_t log) { + assert(log != 0); + std::lock_guard lock(prep_heap_mutex_); + min_log_with_prep_.push(log); + auto it = prepared_section_completed_.find(log); + if (it == prepared_section_completed_.end()) { + prepared_section_completed_[log] = 0; + } +} + +uint64_t DBImpl::FindMinLogContainingOutstandingPrep() { + + if (!allow_2pc()) { + return 0; + } + + std::lock_guard lock(prep_heap_mutex_); + uint64_t min_log = 0; + + // first we look in the prepared heap where we keep + // track of transactions that have been prepared (written to WAL) + // but not yet committed. + while (!min_log_with_prep_.empty()) { + min_log = min_log_with_prep_.top(); + + auto it = prepared_section_completed_.find(min_log); + + // value was marked as 'deleted' from heap + if (it != prepared_section_completed_.end() && it->second > 0) { + it->second -= 1; + min_log_with_prep_.pop(); + + // back to squere one... + min_log = 0; + continue; + } else { + // found a valid value + break; + } + } + + return min_log; +} + +uint64_t DBImpl::MinLogNumberToKeep() { + uint64_t log_number = versions_->MinLogNumber(); + + if (allow_2pc()) { + // if are 2pc we must consider logs containing prepared + // sections of outstanding transactions. + // + // We must check min logs with outstanding prep before we check + // logs referneces by memtables because a log referenced by the + // first data structure could transition to the second under us. + // + // TODO(horuff): iterating over all column families under db mutex. + // should find more optimial solution + auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && min_log_in_prep_heap < log_number) { + log_number = min_log_in_prep_heap; + } + + auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(); + + if (min_log_refed_by_mem != 0 && min_log_refed_by_mem < log_number) { + log_number = min_log_refed_by_mem; + } + } + return log_number; +} + +// * Returns the list of live files in 'sst_live' +// If it's doing full scan: +// * Returns the list of all files in the filesystem in +// 'full_scan_candidate_files'. +// Otherwise, gets obsolete files from VersionSet. 
+// * Returns the list of live files in 'sst_live'
+// If it's doing a full scan:
+// * Returns the list of all files in the filesystem in
+//   'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+//                  mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+                               bool no_full_scan) {
+  mutex_.AssertHeld();
+
+  // if deletion is disabled, do nothing
+  if (disable_delete_obsolete_files_ > 0) {
+    return;
+  }
+
+  bool doing_the_full_scan = false;
+
+  // logic for figuring out if we're doing the full scan
+  if (no_full_scan) {
+    doing_the_full_scan = false;
+  } else if (force ||
+             mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+    doing_the_full_scan = true;
+  } else {
+    const uint64_t now_micros = env_->NowMicros();
+    if ((delete_obsolete_files_last_run_ +
+         mutable_db_options_.delete_obsolete_files_period_micros) <
+        now_micros) {
+      doing_the_full_scan = true;
+      delete_obsolete_files_last_run_ = now_micros;
+    }
+  }
+
+  // don't delete files that might be currently written to from compaction
+  // threads
+  // Since job_context->min_pending_output is set, until file scan finishes,
+  // mutex_ cannot be released. Otherwise, we might see no min_pending_output
+  // here but later find newer generated unfinalized files while scanning.
+  if (!pending_outputs_.empty()) {
+    job_context->min_pending_output = *pending_outputs_.begin();
+  } else {
+    // delete all of them
+    job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
+  }
+
+  // Get obsolete files. This function will also update the list of
+  // pending files in VersionSet().
+  versions_->GetObsoleteFiles(&job_context->sst_delete_files,
+                              &job_context->manifest_delete_files,
+                              job_context->min_pending_output);
+
+  // store the current filenum, lognum, etc
+  job_context->manifest_file_number = versions_->manifest_file_number();
+  job_context->pending_manifest_file_number =
+      versions_->pending_manifest_file_number();
+  job_context->log_number = MinLogNumberToKeep();
+
+  job_context->prev_log_number = versions_->prev_log_number();
+
+  versions_->AddLiveFiles(&job_context->sst_live);
+  if (doing_the_full_scan) {
+    for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+         path_id++) {
+      // set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processing.
+      std::vector<std::string> files;
+      env_->GetChildren(immutable_db_options_.db_paths[path_id].path,
+                        &files);  // Ignore errors
+      for (std::string file : files) {
+        // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
+        job_context->full_scan_candidate_files.emplace_back(
+            "/" + file, static_cast<uint32_t>(path_id));
+      }
+    }
+
+    // Add log files in wal_dir
+    if (immutable_db_options_.wal_dir != dbname_) {
+      std::vector<std::string> log_files;
+      env_->GetChildren(immutable_db_options_.wal_dir,
+                        &log_files);  // Ignore errors
+      for (std::string log_file : log_files) {
+        job_context->full_scan_candidate_files.emplace_back(log_file, 0);
+      }
+    }
+    // Add info log files in db_log_dir
+    if (!immutable_db_options_.db_log_dir.empty() &&
+        immutable_db_options_.db_log_dir != dbname_) {
+      std::vector<std::string> info_log_files;
+      // Ignore errors
+      env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files);
+      for (std::string log_file : info_log_files) {
+        job_context->full_scan_candidate_files.emplace_back(log_file, 0);
+      }
+    }
+  }
+
+  // logs_ is empty when called during recovery, in which case there can't yet
+  // be any tracked obsolete logs
+  if (!alive_log_files_.empty() && !logs_.empty()) {
+    uint64_t min_log_number = job_context->log_number;
+    size_t num_alive_log_files = alive_log_files_.size();
+    // find newly obsoleted log files
+    while (alive_log_files_.begin()->number < min_log_number) {
+      auto& earliest = *alive_log_files_.begin();
+      if (immutable_db_options_.recycle_log_file_num >
+          log_recycle_files.size()) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "adding log %" PRIu64 " to recycle list\n",
+                       earliest.number);
+        log_recycle_files.push_back(earliest.number);
+      } else {
+        job_context->log_delete_files.push_back(earliest.number);
+      }
+      if (job_context->size_log_to_delete == 0) {
+        job_context->prev_total_log_size = total_log_size_;
+        job_context->num_alive_log_files = num_alive_log_files;
+      }
+      job_context->size_log_to_delete += earliest.size;
+      total_log_size_ -= earliest.size;
+      alive_log_files_.pop_front();
+      // Current log should always stay alive since it can't have
+      // number < MinLogNumber().
+      assert(alive_log_files_.size());
+    }
+    while (!logs_.empty() && logs_.front().number < min_log_number) {
+      auto& log = logs_.front();
+      if (log.getting_synced) {
+        log_sync_cv_.Wait();
+        // logs_ could have changed while we were waiting.
+        continue;
+      }
+      logs_to_free_.push_back(log.ReleaseWriter());
+      logs_.pop_front();
+    }
+    // Current log cannot be obsolete.
+    assert(!logs_.empty());
+  }
+
+  // We're just cleaning up for DB::Write().
+  assert(job_context->logs_to_free.empty());
+  job_context->logs_to_free = logs_to_free_;
+  job_context->log_recycle_files.assign(log_recycle_files.begin(),
+                                        log_recycle_files.end());
+  logs_to_free_.clear();
+}
+
+namespace {
+bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
+                          const JobContext::CandidateFileInfo& second) {
+  if (first.file_name > second.file_name) {
+    return true;
+  } else if (first.file_name < second.file_name) {
+    return false;
+  } else {
+    return (first.path_id > second.path_id);
+  }
+}
+}  // namespace
+
+// Delete obsolete files and log status and information of file deletion
+void DBImpl::DeleteObsoleteFileImpl(Status file_deletion_status, int job_id,
+                                    const std::string& fname, FileType type,
+                                    uint64_t number, uint32_t path_id) {
+  if (type == kTableFile) {
+    file_deletion_status =
+        DeleteSSTFile(&immutable_db_options_, fname, path_id);
+  } else {
+    file_deletion_status = env_->DeleteFile(fname);
+  }
+  if (file_deletion_status.ok()) {
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+                    fname.c_str(), type, number,
+                    file_deletion_status.ToString().c_str());
+  } else if (env_->FileExists(fname).IsNotFound()) {
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+        " -- %s\n",
+        job_id, fname.c_str(), type, number,
+        file_deletion_status.ToString().c_str());
+  } else {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+                    job_id, fname.c_str(), type, number,
+                    file_deletion_status.ToString().c_str());
+  }
+  if (type == kTableFile) {
+    EventHelpers::LogAndNotifyTableFileDeletion(
+        &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+        immutable_db_options_.listeners);
+  }
+}
+
+// Diffs the files listed in filenames against the live files, and possibly
+// removes those that do not belong. Also removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) {
+  // we'd better have something to delete
+  assert(state.HaveSomethingToDelete());
+
+  // this checks if FindObsoleteFiles() was run before. If not, don't do
+  // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also
+  // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+  if (state.manifest_file_number == 0) {
+    return;
+  }
+
+  // Now, convert the live list to an unordered map, WITHOUT mutex held;
+  // set is slow.
+  std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
+  for (const FileDescriptor& fd : state.sst_live) {
+    sst_live_map[fd.GetNumber()] = &fd;
+  }
+  std::unordered_set<uint64_t> log_recycle_files_set(
+      state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+  auto candidate_files = state.full_scan_candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() + state.sst_delete_files.size() +
+      state.log_delete_files.size() + state.manifest_delete_files.size());
+  // We may ignore the dbname when generating the file names.
+  const char* kDumbDbName = "";
+  for (auto file : state.sst_delete_files) {
+    candidate_files.emplace_back(
+        MakeTableFileName(kDumbDbName, file->fd.GetNumber()),
+        file->fd.GetPathId());
+    delete file;
+  }
+
+  for (auto file_num : state.log_delete_files) {
+    if (file_num > 0) {
+      candidate_files.emplace_back(LogFileName(kDumbDbName, file_num), 0);
+    }
+  }
+  for (const auto& filename : state.manifest_delete_files) {
+    candidate_files.emplace_back(filename, 0);
+  }
+
+  // dedup state.candidate_files so we don't try to delete the same
+  // file twice
+  std::sort(candidate_files.begin(), candidate_files.end(),
+            CompareCandidateFile);
+  candidate_files.erase(
+      std::unique(candidate_files.begin(), candidate_files.end()),
+      candidate_files.end());
+
+  if (state.prev_total_log_size > 0) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[JOB %d] Try to delete WAL files size %" PRIu64
+                   ", prev total WAL file size %" PRIu64
+                   ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+                   state.job_id, state.size_log_to_delete,
+                   state.prev_total_log_size, state.num_alive_log_files);
+  }
+
+  std::vector<std::string> old_info_log_files;
+  InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+                                dbname_);
+  for (const auto& candidate_file : candidate_files) {
+    std::string to_delete = candidate_file.file_name;
+    uint32_t path_id = candidate_file.path_id;
+    uint64_t number;
+    FileType type;
+    // Ignore file if we cannot recognize it.
+    if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+      continue;
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kLogFile:
+        keep = ((number >= state.log_number) ||
+                (number == state.prev_log_number) ||
+                (log_recycle_files_set.find(number) !=
+                 log_recycle_files_set.end()));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (can happen during manifest roll)
+        keep = (number >= state.manifest_file_number);
+        break;
+      case kTableFile:
+        // If the second condition is not there, this makes
+        // DontDeletePendingOutputs fail
+        keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+               number >= state.min_pending_output;
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live".
+        // Also, SetCurrentFile creates a temp file when writing out new
+        // manifest, which is equal to state.pending_manifest_file_number. We
+        // should not delete that file
+        //
+        // TODO(yhchiang): carefully modify the third condition to safely
+        //                 remove the temp options files.
+        keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+               (number == state.pending_manifest_file_number) ||
+               (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+        break;
+      case kInfoLogFile:
+        keep = true;
+        if (number != 0) {
+          old_info_log_files.push_back(to_delete);
+        }
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kIdentityFile:
+      case kMetaDatabase:
+      case kOptionsFile:
+        keep = true;
+        break;
+    }
+
+    if (keep) {
+      continue;
+    }
+
+    std::string fname;
+    if (type == kTableFile) {
+      // evict from cache
+      TableCache::Evict(table_cache_.get(), number);
+      fname = TableFileName(immutable_db_options_.db_paths, number, path_id);
+    } else {
+      fname = ((type == kLogFile) ? immutable_db_options_.wal_dir : dbname_) +
+              "/" + to_delete;
+    }
+
+#ifndef ROCKSDB_LITE
+    if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 ||
+                             immutable_db_options_.wal_size_limit_mb > 0)) {
+      wal_manager_.ArchiveWALFile(fname, number);
+      continue;
+    }
+#endif  // !ROCKSDB_LITE
+
+    Status file_deletion_status;
+    if (schedule_only) {
+      InstrumentedMutexLock guard_lock(&mutex_);
+      SchedulePendingPurge(fname, type, number, path_id, state.job_id);
+    } else {
+      DeleteObsoleteFileImpl(file_deletion_status, state.job_id, fname, type,
+                             number, path_id);
+    }
+  }
+
+  // Delete old info log files.
+  size_t old_info_log_file_count = old_info_log_files.size();
+  if (old_info_log_file_count != 0 &&
+      old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+    std::sort(old_info_log_files.begin(), old_info_log_files.end());
+    size_t end =
+        old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+    for (unsigned int i = 0; i <= end; i++) {
+      std::string& to_delete = old_info_log_files.at(i);
+      std::string full_path_to_delete =
+          (immutable_db_options_.db_log_dir.empty()
+               ? dbname_
+               : immutable_db_options_.db_log_dir) +
+          "/" + to_delete;
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Delete info log file %s\n", state.job_id,
+                     full_path_to_delete.c_str());
+      Status s = env_->DeleteFile(full_path_to_delete);
+      if (!s.ok()) {
+        if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+              "-- %s\n",
+              state.job_id, to_delete.c_str(), s.ToString().c_str());
+        } else {
+          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                          "[JOB %d] Delete info log file %s FAILED -- %s\n",
+                          state.job_id, to_delete.c_str(),
+                          s.ToString().c_str());
+        }
+      }
+    }
+  }
+#ifndef ROCKSDB_LITE
+  wal_manager_.PurgeObsoleteWALFiles();
+#endif  // ROCKSDB_LITE
+  LogFlush(immutable_db_options_.info_log);
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  JobContext job_context(next_job_id_.fetch_add(1));
+  FindObsoleteFiles(&job_context, true);
+
+  mutex_.Unlock();
+  if (job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+  mutex_.Lock();
+}
+}  // namespace rocksdb
diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc
new file mode 100644
index 000000000..80d6f0f8b
--- /dev/null
+++ b/db/db_impl_open.cc
@@ -0,0 +1,1081 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include + +#include "db/builder.h" +#include "rocksdb/wal_filter.h" +#include "util/options_helper.h" +#include "util/sst_file_manager_impl.h" +#include "util/sync_point.h" + +namespace rocksdb { +Options SanitizeOptions(const std::string& dbname, + const Options& src) { + auto db_options = SanitizeOptions(dbname, DBOptions(src)); + ImmutableDBOptions immutable_db_options(db_options); + auto cf_options = + SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src)); + return Options(db_options, cf_options); +} + +DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { + DBOptions result(src); + + // result.max_open_files means an "infinite" open files. + if (result.max_open_files != -1) { + int max_max_open_files = port::GetMaxOpenFiles(); + if (max_max_open_files == -1) { + max_max_open_files = 1000000; + } + ClipToRange(&result.max_open_files, 20, max_max_open_files); + } + + if (result.info_log == nullptr) { + Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = nullptr; + } + } + + if (!result.write_buffer_manager) { + result.write_buffer_manager.reset( + new WriteBufferManager(result.db_write_buffer_size)); + } + if (result.base_background_compactions == -1) { + result.base_background_compactions = result.max_background_compactions; + } + if (result.base_background_compactions > result.max_background_compactions) { + result.base_background_compactions = result.max_background_compactions; + } + result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions, + Env::Priority::LOW); + result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes, + Env::Priority::HIGH); + + if (result.rate_limiter.get() != nullptr) { + if (result.bytes_per_sync == 0) { + result.bytes_per_sync = 1024 * 1024; + } + } + + if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) { + result.recycle_log_file_num = false; + } + + if (result.recycle_log_file_num && + (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || + result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { + // kPointInTimeRecovery is indistinguishable from + // kTolerateCorruptedTailRecords in recycle mode since we define + // the "end" of the log as the first corrupt record we encounter. + // kAbsoluteConsistency doesn't make sense because even a clean + // shutdown leaves old junk at the end of the log file. + result.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; + } + + if (result.wal_dir.empty()) { + // Use dbname as default + result.wal_dir = dbname; + } + if (result.wal_dir.back() == '/') { + result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); + } + + if (result.db_paths.size() == 0) { + result.db_paths.emplace_back(dbname, std::numeric_limits::max()); + } + + if (result.use_direct_reads && result.compaction_readahead_size == 0) { + result.compaction_readahead_size = 1024 * 1024 * 2; + } + + if (result.compaction_readahead_size > 0) { + result.new_table_reader_for_compaction_inputs = true; + } + + // Force flush on DB open if 2PC is enabled, since with 2PC we have no + // guarantee that consecutive log files have consecutive sequence id, which + // make recovery complicated. 
+  if (result.allow_2pc) {
+    result.avoid_flush_during_recovery = false;
+  }
+
+  return result;
+}
+
+namespace {
+
+Status SanitizeOptionsByTable(
+    const DBOptions& db_opts,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto cf : column_families) {
+    s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+static Status ValidateOptions(
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+
+  for (auto& cfd : column_families) {
+    s = CheckCompressionSupported(cfd.options);
+    if (s.ok() && db_options.allow_concurrent_memtable_write) {
+      s = CheckConcurrentWritesSupported(cfd.options);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+    if (db_options.db_paths.size() > 1) {
+      if ((cfd.options.compaction_style != kCompactionStyleUniversal) &&
+          (cfd.options.compaction_style != kCompactionStyleLevel)) {
+        return Status::NotSupported(
+            "More than one DB paths are only supported in "
+            "universal and level compaction styles. ");
+      }
+    }
+  }
+
+  if (db_options.db_paths.size() > 4) {
+    return Status::NotSupported(
+        "More than four DB paths are not supported yet. ");
+  }
+
+  if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+    // Protect against assert in PosixMMapReadableFile constructor
+    return Status::NotSupported(
+        "If memory mapped reads (allow_mmap_reads) are enabled "
+        "then direct I/O reads (use_direct_reads) must be disabled. ");
+  }
+
+  if (db_options.allow_mmap_writes && db_options.use_direct_writes) {
+    return Status::NotSupported(
+        "If memory mapped writes (allow_mmap_writes) are enabled "
+        "then direct I/O writes (use_direct_writes) must be disabled. ");
+  }
+
+  if (db_options.keep_log_file_num == 0) {
+    return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+  }
+
+  return Status::OK();
+}
+}  // namespace
+Status DBImpl::NewDB() {
+  VersionEdit new_db;
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  Status s;
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  {
+    unique_ptr<WritableFile> file;
+    EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_);
+    s = NewWritableFile(env_, manifest, &file, env_options);
+    if (!s.ok()) {
+      return s;
+    }
+    file->SetPreallocationBlockSize(
+        immutable_db_options_.manifest_preallocation_size);
+    unique_ptr<WritableFileWriter> file_writer(
+        new WritableFileWriter(std::move(file), env_options));
+    log::Writer log(std::move(file_writer), 0, false);
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+    if (s.ok()) {
+      s = SyncManifest(env_, &immutable_db_options_, log.file());
+    }
+  }
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir());
+  } else {
+    env_->DeleteFile(manifest);
+  }
+  return s;
+}
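[Editor's note] ValidateOptions above rejects incompatible option combinations before any file I/O, so DB::Open fails fast with a descriptive Status instead of asserting deep inside the Env layer. The same fail-fast shape, reduced to a standalone sketch with a hypothetical Opts struct rather than the real DBOptions:

    #include <cstddef>
    #include <string>

    struct Opts {
      bool allow_mmap_reads = false;
      bool use_direct_reads = false;
      std::size_t keep_log_file_num = 1000;
    };

    // Returns an empty string when the combination is usable.
    std::string Validate(const Opts& o) {
      if (o.allow_mmap_reads && o.use_direct_reads) {
        return "mmap reads and direct I/O reads are mutually exclusive";
      }
      if (o.keep_log_file_num == 0) {
        return "keep_log_file_num must be greater than 0";
      }
      return "";
    }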
+
+Status DBImpl::Directories::CreateAndNewDirectory(
+    Env* env, const std::string& dirname,
+    std::unique_ptr<Directory>* directory) const {
+  // We call CreateDirIfMissing() as the directory may already exist (if we
+  // are reopening a DB), when this happens we don't want creating the
+  // directory to cause an error. However, we need to check if creating the
+  // directory fails or else we may get an obscure message about the lock
+  // file not existing. One real-world example of this occurring is if
+  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but "dir" doesn't exist.
+  Status s = env->CreateDirIfMissing(dirname);
+  if (!s.ok()) {
+    return s;
+  }
+  return env->NewDirectory(dirname, directory);
+}
+
+Status DBImpl::Directories::SetDirectories(
+    Env* env, const std::string& dbname, const std::string& wal_dir,
+    const std::vector<DbPath>& data_paths) {
+  Status s = CreateAndNewDirectory(env, dbname, &db_dir_);
+  if (!s.ok()) {
+    return s;
+  }
+  if (!wal_dir.empty() && dbname != wal_dir) {
+    s = CreateAndNewDirectory(env, wal_dir, &wal_dir_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  data_dirs_.clear();
+  for (auto& p : data_paths) {
+    const std::string db_path = p.path;
+    if (db_path == dbname) {
+      data_dirs_.emplace_back(nullptr);
+    } else {
+      std::unique_ptr<Directory> path_directory;
+      s = CreateAndNewDirectory(env, db_path, &path_directory);
+      if (!s.ok()) {
+        return s;
+      }
+      data_dirs_.emplace_back(path_directory.release());
+    }
+  }
+  assert(data_dirs_.size() == data_paths.size());
+  return Status::OK();
+}
+
+Status DBImpl::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    bool error_if_log_file_exist, bool error_if_data_exists_in_logs) {
+  mutex_.AssertHeld();
+
+  bool is_new_db = false;
+  assert(db_lock_ == nullptr);
+  if (!read_only) {
+    Status s = directories_.SetDirectories(env_, dbname_,
+                                           immutable_db_options_.wal_dir,
+                                           immutable_db_options_.db_paths);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->FileExists(CurrentFileName(dbname_));
+    if (s.IsNotFound()) {
+      if (immutable_db_options_.create_if_missing) {
+        s = NewDB();
+        is_new_db = true;
+        if (!s.ok()) {
+          return s;
+        }
+      } else {
+        return Status::InvalidArgument(
+            dbname_, "does not exist (create_if_missing is false)");
+      }
+    } else if (s.ok()) {
+      if (immutable_db_options_.error_if_exists) {
+        return Status::InvalidArgument(
+            dbname_, "exists (error_if_exists is true)");
+      }
+    } else {
+      // Unexpected error reading file
+      assert(s.IsIOError());
+      return s;
+    }
+    // Check for the IDENTITY file and create it if not there
+    s = env_->FileExists(IdentityFileName(dbname_));
+    if (s.IsNotFound()) {
+      s = SetIdentityFile(env_, dbname_);
+      if (!s.ok()) {
+        return s;
+      }
+    } else if (!s.ok()) {
+      assert(s.IsIOError());
+      return s;
+    }
+  }
+
+  Status s = versions_->Recover(column_families, read_only);
+  if (immutable_db_options_.paranoid_checks && s.ok()) {
+    s = CheckConsistency();
+  }
+  if (s.ok()) {
+    SequenceNumber next_sequence(kMaxSequenceNumber);
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+    single_column_family_mode_ =
+        versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that prev_log_number() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of rocksdb.
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
+        if (is_new_db) {
+          return Status::Corruption(
+              "While creating a new Db, wal_dir contains "
+              "existing log file: ",
+              filenames[i]);
+        } else {
+          logs.push_back(number);
+        }
+      }
+    }
+
+    if (logs.size() > 0) {
+      if (error_if_log_file_exist) {
+        return Status::Corruption(
+            "The db was opened in readonly mode with error_if_log_file_exist "
+            "flag but a log file already exists");
+      } else if (error_if_data_exists_in_logs) {
+        for (auto& log : logs) {
+          std::string fname = LogFileName(immutable_db_options_.wal_dir, log);
+          uint64_t bytes;
+          s = env_->GetFileSize(fname, &bytes);
+          if (s.ok()) {
+            if (bytes > 0) {
+              return Status::Corruption(
+                  "error_if_data_exists_in_logs is set but there are data "
+                  "in log files.");
+            }
+          }
+        }
+      }
+    }
+
+    if (!logs.empty()) {
+      // Recover in the order in which the logs were generated
+      std::sort(logs.begin(), logs.end());
+      s = RecoverLogFiles(logs, &next_sequence, read_only);
+      if (!s.ok()) {
+        // Clear memtables if recovery failed
+        for (auto cfd : *versions_->GetColumnFamilySet()) {
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 kMaxSequenceNumber);
+        }
+      }
+    }
+  }
+
+  // Initial value
+  max_total_in_memory_state_ = 0;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+                                  mutable_cf_options->max_write_buffer_number;
+  }
+
+  return s;
+}
+
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                               SequenceNumber* next_sequence, bool read_only) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
+    virtual void Corruption(size_t bytes, const Status& s) override {
"(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != nullptr && this->status->ok()) { + *this->status = s; + } + } + }; + + mutex_.AssertHeld(); + Status status; + std::unordered_map version_edits; + // no need to refcount because iteration is under mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + version_edits.insert({cfd->GetID(), edit}); + } + int job_id = next_job_id_.fetch_add(1); + { + auto stream = event_logger_.Log(); + stream << "job" << job_id << "event" + << "recovery_started"; + stream << "log_files"; + stream.StartArray(); + for (auto log_number : log_numbers) { + stream << log_number; + } + stream.EndArray(); + } + +#ifndef ROCKSDB_LITE + if (immutable_db_options_.wal_filter != nullptr) { + std::map cf_name_id_map; + std::map cf_lognumber_map; + for (auto cfd : *versions_->GetColumnFamilySet()) { + cf_name_id_map.insert( + std::make_pair(cfd->GetName(), cfd->GetID())); + cf_lognumber_map.insert( + std::make_pair(cfd->GetID(), cfd->GetLogNumber())); + } + + immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map, + cf_name_id_map); + } +#endif + + bool stop_replay_by_wal_filter = false; + bool stop_replay_for_corruption = false; + bool flushed = false; + for (auto log_number : log_numbers) { + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsedDuringRecovery(log_number); + // Open the log file + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Recovering log #%" PRIu64 " mode %d", log_number, + immutable_db_options_.wal_recovery_mode); + auto logFileDropped = [this, &fname]() { + uint64_t bytes; + if (env_->GetFileSize(fname, &bytes).ok()) { + auto info_log = immutable_db_options_.info_log.get(); + ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(), + static_cast(bytes)); + } + }; + if (stop_replay_by_wal_filter) { + logFileDropped(); + continue; + } + + unique_ptr file_reader; + { + unique_ptr file; + status = env_->NewSequentialFile(fname, &file, env_options_); + if (!status.ok()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } + } + file_reader.reset(new SequentialFileReader(std::move(file))); + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = immutable_db_options_.info_log.get(); + reporter.fname = fname.c_str(); + if (!immutable_db_options_.paranoid_checks || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kSkipAnyCorruptedRecords) { + reporter.status = nullptr; + } else { + reporter.status = &status; + } + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). 
+    // We intentionally make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+                       &reporter, true /*checksum*/, 0 /*initial_offset*/,
+                       log_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of
+    // the log. Read all the records and add to a memtable.
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+
+    while (!stop_replay_by_wal_filter &&
+           reader.ReadRecord(&record, &scratch,
+                             immutable_db_options_.wal_recovery_mode) &&
+           status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+      if (immutable_db_options_.wal_recovery_mode ==
+          WALRecoveryMode::kPointInTimeRecovery) {
+        // In point-in-time recovery mode, if sequence ids of log files are
+        // consecutive, we continue recovery despite corruption. This could
+        // happen when we open and write to a corrupted DB, where sequence ids
+        // will start from the last sequence id we recovered.
+        if (sequence == *next_sequence) {
+          stop_replay_for_corruption = false;
+        }
+        if (stop_replay_for_corruption) {
+          logFileDropped();
+          break;
+        }
+      }
+
+#ifndef ROCKSDB_LITE
+      if (immutable_db_options_.wal_filter != nullptr) {
+        WriteBatch new_batch;
+        bool batch_changed = false;
+
+        WalFilter::WalProcessingOption wal_processing_option =
+            immutable_db_options_.wal_filter->LogRecordFound(
+                log_number, fname, batch, &new_batch, &batch_changed);
+
+        switch (wal_processing_option) {
+          case WalFilter::WalProcessingOption::kContinueProcessing:
+            // do nothing, proceed normally
+            break;
+          case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+            // skip current record
+            continue;
+          case WalFilter::WalProcessingOption::kStopReplay:
+            // skip current record and stop replay
+            stop_replay_by_wal_filter = true;
+            continue;
+          case WalFilter::WalProcessingOption::kCorruptedRecord: {
+            status =
+                Status::Corruption("Corruption reported by Wal Filter ",
+                                   immutable_db_options_.wal_filter->Name());
+            MaybeIgnoreError(&status);
+            if (!status.ok()) {
+              reporter.Corruption(record.size(), status);
+              continue;
+            }
+            break;
+          }
+          default: {
+            assert(false);  // unhandled case
+            status = Status::NotSupported(
+                "Unknown WalProcessingOption returned"
+                " by Wal Filter ",
+                immutable_db_options_.wal_filter->Name());
+            MaybeIgnoreError(&status);
+            if (!status.ok()) {
+              return status;
+            } else {
+              // Ignore the error with current record processing.
+              continue;
+            }
+          }
+        }
+
+        if (batch_changed) {
+          // Make sure that the count in the new batch is
+          // within the original count.
+          int new_count = WriteBatchInternal::Count(&new_batch);
+          int original_count = WriteBatchInternal::Count(&batch);
+          if (new_count > original_count) {
+            ROCKS_LOG_FATAL(
+                immutable_db_options_.info_log,
+                "Recovering log #%" PRIu64
+                " mode %d log filter %s returned "
+                "more records (%d) than original (%d) which is not allowed. "
+                "Aborting recovery.",
+                log_number, immutable_db_options_.wal_recovery_mode,
+                immutable_db_options_.wal_filter->Name(), new_count,
+                original_count);
+            status = Status::NotSupported(
+                "More than original # of records "
+                "returned by Wal Filter ",
+                immutable_db_options_.wal_filter->Name());
+            return status;
+          }
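The switch above is the whole WAL-filter contract: the verdict from LogRecordFound decides whether a recovered record is kept, rewritten, skipped, or treated as corruption. As a minimal sketch of the producing side (assuming only the rocksdb/wal_filter.h interface that this loop invokes):

    #include <string>
    #include "rocksdb/wal_filter.h"
    #include "rocksdb/write_batch.h"

    // Sketch: a WAL filter that drops every recovered record. The replay
    // loop above maps kIgnoreCurrentRecord to `continue`, so none of these
    // batches ever reach a memtable. Installed via DBOptions::wal_filter.
    class DropAllWalFilter : public rocksdb::WalFilter {
     public:
      WalProcessingOption LogRecordFound(unsigned long long /*log_number*/,
                                         const std::string& /*log_file_name*/,
                                         const rocksdb::WriteBatch& /*batch*/,
                                         rocksdb::WriteBatch* /*new_batch*/,
                                         bool* /*batch_changed*/) override {
        return WalProcessingOption::kIgnoreCurrentRecord;
      }
      const char* Name() const override { return "DropAllWalFilter"; }
    };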
+          // Set the same sequence number in the new_batch
+          // as the original batch.
+          WriteBatchInternal::SetSequence(&new_batch,
+                                          WriteBatchInternal::Sequence(&batch));
+          batch = new_batch;
+        }
+      }
+#endif  // ROCKSDB_LITE
+
+      // If column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the
+      // insert. We don't want to fail the whole write batch in that case --
+      // we just ignore the update.
+      // That's why we set ignore missing column families to true
+      bool has_valid_writes = false;
+      status = WriteBatchInternal::InsertInto(
+          &batch, column_family_memtables_.get(), &flush_scheduler_, true,
+          log_number, this, false /* concurrent_memtable_writes */,
+          next_sequence, &has_valid_writes);
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        // We are treating this as a failure while reading since we read valid
+        // blocks that do not form coherent data
+        reporter.Corruption(record.size(), status);
+        continue;
+      }
+
+      if (has_valid_writes && !read_only) {
+        // we can do this because this is called before the client has access
+        // to the DB and there is only a single thread operating on the DB
+        ColumnFamilyData* cfd;
+
+        while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+          cfd->Unref();
+          // If this asserts, it means that InsertInto failed in
+          // filtering updates to already-flushed column families
+          assert(cfd->GetLogNumber() <= log_number);
+          auto iter = version_edits.find(cfd->GetID());
+          assert(iter != version_edits.end());
+          VersionEdit* edit = &iter->second;
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+          if (!status.ok()) {
+            // Reflect errors immediately so that conditions like full
+            // file-systems cause the DB::Open() to fail.
+            return status;
+          }
+          flushed = true;
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 *next_sequence);
+        }
+      }
+    }
+
+    if (!status.ok()) {
+      if (immutable_db_options_.wal_recovery_mode ==
+          WALRecoveryMode::kSkipAnyCorruptedRecords) {
+        // We should ignore all errors unconditionally
+        status = Status::OK();
+      } else if (immutable_db_options_.wal_recovery_mode ==
+                 WALRecoveryMode::kPointInTimeRecovery) {
+        // We should ignore the error but not continue replaying
+        status = Status::OK();
+        stop_replay_for_corruption = true;
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "Point in time recovered to log #%" PRIu64
+                       " seq #%" PRIu64,
+                       log_number, *next_sequence);
+      } else {
+        assert(immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kTolerateCorruptedTailRecords ||
+               immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kAbsoluteConsistency);
+        return status;
+      }
+    }
+
+    flush_scheduler_.Clear();
+    auto last_sequence = *next_sequence - 1;
+    if ((*next_sequence != kMaxSequenceNumber) &&
+        (versions_->LastSequence() <= last_sequence)) {
+      versions_->SetLastSequence(last_sequence);
+    }
+  }
+
+  // True if there's any data in the WALs; if not, we can skip re-processing
+  // them later
+  bool data_seen = false;
+  if (!read_only) {
+    // no need to refcount since client still doesn't have access
+    // to the DB and can not drop column families while we iterate
+    auto max_log_number = log_numbers.back();
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      auto iter = version_edits.find(cfd->GetID());
+      assert(iter != version_edits.end());
+      VersionEdit* edit = &iter->second;
+
+      if (cfd->GetLogNumber() > max_log_number) {
+        // Column family cfd has already flushed the data
+        // from all logs. Memtable has to be empty because
+        // we filter the updates based on log_number
+        // (in WriteBatch::InsertInto)
+        assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+        assert(edit->NumEntries() == 0);
+        continue;
+      }
+
+      // flush the final memtable (if non-empty)
+      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+        // If flush happened in the middle of recovery (e.g. due to memtable
+        // being full), we flush at the end. Otherwise we'll need to record
+        // where we were on last flush, which makes the logic complicated.
+        if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+          if (!status.ok()) {
+            // Recovery failed
+            break;
+          }
+          flushed = true;
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 versions_->LastSequence());
+        }
+        data_seen = true;
+      }
+
+      // write MANIFEST with update
+      // writing log_number in the manifest means that any log file
+      // with number strictly less than (log_number + 1) is already
+      // recovered and should be ignored on next reincarnation.
+      // Since we already recovered max_log_number, we want all logs
+      // with numbers `<= max_log_number` (includes this one) to be ignored
+      if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+        edit->SetLogNumber(max_log_number + 1);
+      }
+      // we must mark the next log number as used, even though it's
+      // not actually used. that is because VersionSet assumes
+      // VersionSet::next_file_number_ always to be strictly greater than any
+      // log number
+      versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1);
+      status = versions_->LogAndApply(
+          cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_);
+      if (!status.ok()) {
+        // Recovery failed
+        break;
+      }
+    }
+  }
+
+  if (data_seen && !flushed) {
+    // Mark these as alive so they'll be considered for deletion later by
+    // FindObsoleteFiles()
+    for (auto log_number : log_numbers) {
+      alive_log_files_.push_back(LogFileNumberSize(log_number));
+    }
+  }
+
+  event_logger_.Log() << "job" << job_id << "event"
+                      << "recovery_finished";
+
+  return status;
+}
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                           MemTable* mem, VersionEdit* edit) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  FileMetaData meta;
+  auto pending_outputs_inserted_elem =
+      CaptureCurrentFileNumberInPendingOutputs();
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  Arena arena;
+  Status s;
+  TableProperties table_properties;
+  {
+    ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] [WriteLevel0TableForRecovery]"
+                    " Level-0 table #%" PRIu64 ": started",
+                    cfd->GetName().c_str(), meta.fd.GetNumber());
+
+    // Get the latest mutable cf options while the mutex is still locked
+    const MutableCFOptions mutable_cf_options =
+        *cfd->GetLatestMutableCFOptions();
+    bool paranoid_file_checks =
+        cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+    {
+      mutex_.Unlock();
+
+      SequenceNumber earliest_write_conflict_snapshot;
+      std::vector<SequenceNumber> snapshot_seqs =
+          snapshots_.GetAll(&earliest_write_conflict_snapshot);
+
+      s = BuildTable(
+          dbname_, env_, *cfd->ioptions(), mutable_cf_options, env_options_,
+          cfd->table_cache(), iter.get(),
+          std::unique_ptr<InternalIterator>(
+              mem->NewRangeTombstoneIterator(ro)),
+          &meta, cfd->internal_comparator(),
+          cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
+          snapshot_seqs, earliest_write_conflict_snapshot,
+          GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+          cfd->ioptions()->compression_opts, paranoid_file_checks,
+          cfd->internal_stats(), TableFileCreationReason::kRecovery,
+          &event_logger_, job_id);
+      LogFlush(immutable_db_options_.info_log);
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] [WriteLevel0TableForRecovery]"
+                      " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+                      cfd->GetName().c_str(), meta.fd.GetNumber(),
+                      meta.fd.GetFileSize(), s.ToString().c_str());
+      mutex_.Lock();
+    }
+  }
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.fd.GetFileSize() > 0) {
+    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
+                  meta.smallest_seqno, meta.largest_seqno,
+                  meta.marked_for_compaction);
+  }
+
+  InternalStats::CompactionStats stats(1);
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.fd.GetFileSize();
+  stats.num_output_files = 1;
+  cfd->internal_stats()->AddCompactionStats(level, stats);
+  cfd->internal_stats()->AddCFStats(
+      InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize());
+  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+  return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // we can delete the handle since DBImpl is always holding a reference to
+    // the default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  Status s = SanitizeOptionsByTable(db_options, column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = ValidateOptions(db_options, column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *dbptr = nullptr;
+  handles->clear();
+
+  size_t max_write_buffer_size = 0;
+  for (auto cf : column_families) {
+    max_write_buffer_size =
+        std::max(max_write_buffer_size, cf.options.write_buffer_size);
+  }
+
+  DBImpl* impl = new DBImpl(db_options, dbname);
+  s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir);
+  if (s.ok()) {
+    for (auto db_path : impl->immutable_db_options_.db_paths) {
+      s = impl->env_->CreateDirIfMissing(db_path.path);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+
+  s = impl->CreateArchivalDirectory();
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+  impl->mutex_.Lock();
+  // Handles create_if_missing, error_if_exists
+  s = impl->Recover(column_families);
+  if (s.ok()) {
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
+    unique_ptr<WritableFile> lfile;
+    EnvOptions soptions(db_options);
+    EnvOptions opt_env_options =
+        impl->immutable_db_options_.env->OptimizeForLogWrite(
+            soptions, BuildDBOptions(impl->immutable_db_options_,
+                                     impl->mutable_db_options_));
+    s = NewWritableFile(
+        impl->immutable_db_options_.env,
+        LogFileName(impl->immutable_db_options_.wal_dir, new_log_number),
+        &lfile, opt_env_options);
+    if (s.ok()) {
+      lfile->SetPreallocationBlockSize(
+          impl->GetWalPreallocateBlockSize(max_write_buffer_size));
+      impl->logfile_number_ = new_log_number;
+      unique_ptr<WritableFileWriter> file_writer(
+          new WritableFileWriter(std::move(lfile), opt_env_options));
+      impl->logs_.emplace_back(
+          new_log_number,
+          new log::Writer(
+              std::move(file_writer), new_log_number,
+              impl->immutable_db_options_.recycle_log_file_num > 0));
+
+      // set column family handles
+      for (auto cf : column_families) {
+        auto cfd =
+            impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+        if (cfd != nullptr) {
+          handles->push_back(
+              new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+          impl->NewThreadStatusCfInfo(cfd);
+        } else {
+          if (db_options.create_missing_column_families) {
+            // missing column family, create it
+            ColumnFamilyHandle* handle;
+            impl->mutex_.Unlock();
+            s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+            impl->mutex_.Lock();
+            if (s.ok()) {
+              handles->push_back(handle);
+            } else {
+              break;
+            }
+          } else {
+            s = Status::InvalidArgument("Column family not found: ", cf.name);
+            break;
+          }
+        }
+      }
+    }
+    if (s.ok()) {
+      for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+        delete impl->InstallSuperVersionAndScheduleWork(
+            cfd, nullptr, *cfd->GetLatestMutableCFOptions());
+      }
+      impl->alive_log_files_.push_back(
+          DBImpl::LogFileNumberSize(impl->logfile_number_));
+      impl->DeleteObsoleteFiles();
+      s = impl->directories_.GetDbDir()->Fsync();
+    }
+  }
+
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+        auto* vstorage = cfd->current()->storage_info();
+        for (int i = 1; i < vstorage->num_levels(); ++i) {
+          int num_files = vstorage->NumLevelFiles(i);
+          if (num_files > 0) {
+            s = Status::InvalidArgument(
+                "Not all files are at level 0. Cannot "
+                "open with FIFO compaction style.");
+            break;
+          }
+        }
+      }
+      if (!cfd->mem()->IsSnapshotSupported()) {
+        impl->is_snapshot_supported_ = false;
+      }
+      if (cfd->ioptions()->merge_operator != nullptr &&
+          !cfd->mem()->IsMergeOperatorSupported()) {
+        s = Status::InvalidArgument(
+            "The memtable of column family %s does not support merge "
+            "operator, but its options.merge_operator is non-null",
+            cfd->GetName().c_str());
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::Open:Opened");
+  Status persist_options_status;
+  if (s.ok()) {
+    // Persist RocksDB Options before scheduling the compaction.
+    // The WriteOptionsFile() will release and lock the mutex internally.
+    persist_options_status = impl->WriteOptionsFile();
+
+    *dbptr = impl;
+    impl->opened_successfully_ = true;
+    impl->MaybeScheduleFlushOrCompaction();
+  }
+  impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      impl->immutable_db_options_.sst_file_manager.get());
+  if (s.ok() && sfm) {
+    // Notify SstFileManager about all sst files that already exist in
+    // db_paths[0] when the DB is opened.
+    auto& db_path = impl->immutable_db_options_.db_paths[0];
+    std::vector<std::string> existing_files;
+    impl->immutable_db_options_.env->GetChildren(db_path.path, &existing_files);
+    for (auto& file_name : existing_files) {
+      uint64_t file_number;
+      FileType file_type;
+      std::string file_path = db_path.path + "/" + file_name;
+      if (ParseFileName(file_name, &file_number, &file_type) &&
+          file_type == kTableFile) {
+        sfm->OnAddFile(file_path);
+      }
+    }
+  }
+#endif  // !ROCKSDB_LITE
+
+  if (s.ok()) {
+    ROCKS_LOG_INFO(impl->immutable_db_options_.info_log, "DB pointer %p", impl);
+    LogFlush(impl->immutable_db_options_.info_log);
+    if (!persist_options_status.ok()) {
+      if (db_options.fail_if_options_file_error) {
+        s = Status::IOError(
+            "DB::Open() failed --- Unable to persist Options file",
+            persist_options_status.ToString());
+      }
+      ROCKS_LOG_WARN(impl->immutable_db_options_.info_log,
+                     "Unable to persist options in DB::Open() -- %s",
+                     persist_options_status.ToString().c_str());
+    }
+  }
+  if (!s.ok()) {
+    for (auto* h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+    *dbptr = nullptr;
+  }
+  return s;
+}
+}  // namespace rocksdb
diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc
new file mode 100644
index 000000000..49e68d056
--- /dev/null
+++ b/db/db_impl_write.cc
@@ -0,0 +1,819 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
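Before the new file's body begins, a usage sketch may help ground the open path implemented above: the column-family overload of DB::Open returns one handle per descriptor, and the caller owns those handles. The path and the extra "aux" column family below are illustrative only.

    #include <cassert>
    #include <vector>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::DBOptions db_options;
      db_options.create_if_missing = true;
      db_options.create_missing_column_families = true;

      std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
      column_families.emplace_back(rocksdb::kDefaultColumnFamilyName,
                                   rocksdb::ColumnFamilyOptions());
      column_families.emplace_back("aux", rocksdb::ColumnFamilyOptions());

      std::vector<rocksdb::ColumnFamilyHandle*> handles;
      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(db_options, "/tmp/testdb",
                                            column_families, &handles, &db);
      assert(s.ok());

      // Handles are owned by the caller; delete them before the DB itself.
      for (auto* h : handles) {
        delete h;
      }
      delete db;
      return 0;
    }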
+#include "db/db_impl.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include +#include "util/options_helper.h" +#include "util/perf_context_imp.h" +#include "util/sync_point.h" + +namespace rocksdb { +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + return DB::Put(o, column_family, key, val); +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + auto cfh = reinterpret_cast(column_family); + if (!cfh->cfd()->ioptions()->merge_operator) { + return Status::NotSupported("Provide a merge_operator when opening DB"); + } else { + return DB::Merge(o, column_family, key, val); + } +} + +Status DBImpl::Delete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key) { + return DB::Delete(write_options, column_family, key); +} + +Status DBImpl::SingleDelete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& key) { + return DB::SingleDelete(write_options, column_family, key); +} + +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { + return WriteImpl(write_options, my_batch, nullptr, nullptr); +} + +#ifndef ROCKSDB_LITE +Status DBImpl::WriteWithCallback(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback) { + return WriteImpl(write_options, my_batch, callback, nullptr); +} +#endif // ROCKSDB_LITE + +Status DBImpl::WriteImpl(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable) { + if (my_batch == nullptr) { + return Status::Corruption("Batch is nullptr!"); + } + + Status status; + + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable); + + if (!write_options.disableWAL) { + RecordTick(stats_, WRITE_WITH_WAL); + } + + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + write_thread_.JoinBatchGroup(&w); + if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) { + // we are a non-leader in a parallel group + PERF_TIMER_GUARD(write_memtable_time); + + if (w.ShouldWriteToMemtable()) { + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + WriteBatchInternal::SetSequence(w.batch, w.sequence); + w.status = WriteBatchInternal::InsertInto( + &w, &column_family_memtables, &flush_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + true /*concurrent_memtable_writes*/); + } + + if (write_thread_.CompleteParallelWorker(&w)) { + // we're responsible for early exit + auto last_sequence = w.parallel_group->last_sequence; + versions_->SetLastSequence(last_sequence); + write_thread_.EarlyExitParallelGroup(&w); + } + assert(w.state == WriteThread::STATE_COMPLETED); + // STATE_COMPLETED conditional below handles exit + + status = w.FinalStatus(); + } + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + // write is complete and leader has updated sequence + return w.FinalStatus(); + } + // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); + + // Once reaches this point, the current writer "w" will try to do its write + // job. 
+  // It may also pick up some of the remaining writers in the "writers_"
+  // queue when it finds them suitable, and finish them in the same write
+  // batch. This is how a write job can be done by another writer.
+  WriteContext write_context;
+  WriteThread::Writer* last_writer = &w;
+  autovector<WriteThread::Writer*> write_group;
+  bool logs_getting_synced = false;
+
+  mutex_.Lock();
+
+  bool need_log_sync = !write_options.disableWAL && write_options.sync;
+  bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+  status = PreprocessWrite(write_options, need_log_sync, &logs_getting_synced,
+                           &write_context);
+  uint64_t last_sequence = versions_->LastSequence();
+  log::Writer* cur_log_writer = logs_.back().writer;
+
+  mutex_.Unlock();
+
+  // Add to log and apply to memtable. We can release the lock
+  // during this phase since &w is currently responsible for logging
+  // and protects against concurrent loggers and concurrent writes
+  // into memtables
+
+  bool exit_completed_early = false;
+  last_batch_group_size_ =
+      write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group);
+
+  if (status.ok()) {
+    // Rules for when we can update the memtable concurrently
+    // 1. supported by memtable
+    // 2. Puts are not okay if inplace_update_support
+    // 3. Deletes or SingleDeletes are not okay if filtering deletes
+    //    (controlled by both batch and memtable setting)
+    // 4. Merges are not okay
+    //
+    // Rules 1..3 are enforced by checking the options
+    // during startup (CheckConcurrentWritesSupported), so if
+    // options.allow_concurrent_memtable_write is true then they can be
+    // assumed to be true. Rule 4 is checked for each batch. We could
+    // relax rules 2 and 3 if we could prevent write batches from referring
+    // more than once to a particular key.
+    bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+                    write_group.size() > 1;
+    int total_count = 0;
+    uint64_t total_byte_size = 0;
+    for (auto writer : write_group) {
+      if (writer->CheckCallback(this)) {
+        if (writer->ShouldWriteToMemtable()) {
+          total_count += WriteBatchInternal::Count(writer->batch);
+          parallel = parallel && !writer->batch->HasMerge();
+        }
+
+        if (writer->ShouldWriteToWAL()) {
+          total_byte_size = WriteBatchInternal::AppendedByteSize(
+              total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        }
+      }
+    }
+
+    const SequenceNumber current_sequence = last_sequence + 1;
+    last_sequence += total_count;
+
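Each Writer in the group carries one WriteBatch, and the leader publishes the whole group under a single sequence-number range. The same economics are available to a caller explicitly: several updates placed in one WriteBatch travel through WriteImpl as one Writer and one WAL record. A hedged sketch against the public API:

    #include "rocksdb/db.h"
    #include "rocksdb/write_batch.h"

    // Sketch: batch several updates so they commit atomically as one Writer.
    // Merge() requires a merge_operator, as DBImpl::Merge above enforces.
    rocksdb::Status WriteSeveral(rocksdb::DB* db) {
      rocksdb::WriteBatch batch;
      batch.Put("k1", "v1");
      batch.Delete("k2");
      batch.Merge("k3", "delta");
      return db->Write(rocksdb::WriteOptions(), &batch);
    }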
+    // Update stats while we are an exclusive group leader, so we know
+    // that nobody else can be writing to these particular stats.
+    // We're optimistic, updating the stats before we successfully
+    // commit. That lets us release our leader status early in
+    // some cases.
+    auto stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count);
+    RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+    stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size);
+    RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+    stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
+    RecordTick(stats_, WRITE_DONE_BY_SELF);
+    auto write_done_by_other = write_group.size() - 1;
+    if (write_done_by_other > 0) {
+      stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
+                        write_done_by_other);
+      RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+    }
+    MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size);
+
+    if (write_options.disableWAL) {
+      has_unpersisted_data_.store(true, std::memory_order_relaxed);
+    }
+
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    if (status.ok() && !write_options.disableWAL) {
+      PERF_TIMER_GUARD(write_wal_time);
+      status = WriteToWAL(write_group, cur_log_writer, need_log_sync,
+                          need_log_dir_sync, current_sequence);
+      if (log_used != nullptr) {
+        *log_used = logfile_number_;
+      }
+    }
+
+    if (status.ok()) {
+      PERF_TIMER_GUARD(write_memtable_time);
+
+      if (!parallel) {
+        status = WriteBatchInternal::InsertInto(
+            write_group, current_sequence, column_family_memtables_.get(),
+            &flush_scheduler_, write_options.ignore_missing_column_families,
+            0 /*log_number*/, this);
+
+        if (status.ok()) {
+          // There were no write failures. Set leader's status
+          // in case the write callback returned a non-ok status.
+          status = w.FinalStatus();
+        }
+
+      } else {
+        WriteThread::ParallelGroup pg;
+        pg.leader = &w;
+        pg.last_writer = last_writer;
+        pg.last_sequence = last_sequence;
+        pg.early_exit_allowed = !need_log_sync;
+        pg.running.store(static_cast<uint32_t>(write_group.size()),
+                         std::memory_order_relaxed);
+        write_thread_.LaunchParallelFollowers(&pg, current_sequence);
+
+        if (w.ShouldWriteToMemtable()) {
+          // do leader write
+          ColumnFamilyMemTablesImpl column_family_memtables(
+              versions_->GetColumnFamilySet());
+          assert(w.sequence == current_sequence);
+          WriteBatchInternal::SetSequence(w.batch, w.sequence);
+          w.status = WriteBatchInternal::InsertInto(
+              &w, &column_family_memtables, &flush_scheduler_,
+              write_options.ignore_missing_column_families, 0 /*log_number*/,
+              this, true /*concurrent_memtable_writes*/);
+        }
+
+        // CompleteParallelWorker returns true if this thread should
+        // handle exit, false means somebody else did
+        exit_completed_early = !write_thread_.CompleteParallelWorker(&w);
+        status = w.FinalStatus();
+      }
+
+      if (!exit_completed_early && w.status.ok()) {
+        versions_->SetLastSequence(last_sequence);
+        if (!need_log_sync) {
+          write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status);
+          exit_completed_early = true;
+        }
+      }
+
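The branches above all key off WriteOptions: disableWAL skips logging entirely (and marks has_unpersisted_data_), sync selects the fsync path, and no_slowdown turns a stall into an immediate Incomplete status in DelayWrite below. A short sketch of those knobs and their trade-offs:

    #include "rocksdb/options.h"

    // Durable: the WAL record is fsynced (the need_log_sync path) on return.
    rocksdb::WriteOptions DurableWrite() {
      rocksdb::WriteOptions wo;
      wo.sync = true;
      return wo;
    }

    // Fast but volatile: no WAL record, so the write is lost on a crash.
    rocksdb::WriteOptions UnloggedWrite() {
      rocksdb::WriteOptions wo;
      wo.disableWAL = true;
      return wo;
    }

    // Never blocks on a write stall; DelayWrite returns Status::Incomplete.
    rocksdb::WriteOptions NonBlockingWrite() {
      rocksdb::WriteOptions wo;
      wo.no_slowdown = true;
      return wo;
    }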
+      // A non-OK status here indicates that the state implied by the
+      // WAL has diverged from the in-memory state. This could be
+      // because of a corrupt write_batch (very bad), or because the
+      // client specified an invalid column family and didn't specify
+      // ignore_missing_column_families.
+      //
+      // Is setting bg_error_ enough here? This will at least stop
+      // compaction and fail any further writes.
+      if (!status.ok() && bg_error_.ok() && !w.CallbackFailed()) {
+        bg_error_ = status;
+      }
+    }
+  }
+  PERF_TIMER_START(write_pre_and_post_process_time);
+
+  if (immutable_db_options_.paranoid_checks && !status.ok() &&
+      !w.CallbackFailed() && !status.IsBusy() && !status.IsIncomplete()) {
+    mutex_.Lock();
+    if (bg_error_.ok()) {
+      bg_error_ = status;  // stop compaction & fail any further writes
+    }
+    mutex_.Unlock();
+  }
+
+  if (logs_getting_synced) {
+    mutex_.Lock();
+    MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
+    mutex_.Unlock();
+  }
+
+  if (!exit_completed_early) {
+    write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status);
+  }
+
+  return status;
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+                               bool need_log_sync, bool* logs_getting_synced,
+                               WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr && logs_getting_synced != nullptr);
+  Status status;
+
+  assert(!single_column_family_mode_ ||
+         versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
+  if (UNLIKELY(status.ok() && !single_column_family_mode_ &&
+               total_log_size_ > GetMaxTotalWalSize())) {
+    status = HandleWALFull(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+    // Before a new memtable is added in SwitchMemtable(),
+    // write_buffer_manager_->ShouldFlush() will keep returning true. If
+    // another thread is writing to another DB with the same write buffer,
+    // that DB may get flushed as well. We may end up flushing many more DBs
+    // than needed. It's suboptimal but still correct.
+    status = HandleWriteBufferFull(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && !bg_error_.ok())) {
+    status = bg_error_;
+  }
+
+  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+    status = ScheduleFlushes(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+                               write_controller_.NeedsDelay()))) {
+    PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the
+    // size of the previous one. It might create a fairness issue in that
+    // expiration might happen for smaller writes while larger writes go
+    // through. Can optimize it if it is an issue.
+    status = DelayWrite(last_batch_group_size_, write_options);
+  }
+
+  if (status.ok() && need_log_sync) {
+    while (logs_.front().getting_synced) {
+      log_sync_cv_.Wait();
+    }
+    for (auto& log : logs_) {
+      assert(!log.getting_synced);
+      log.getting_synced = true;
+    }
+    *logs_getting_synced = true;
+  }
+
+  return status;
+}
+
+Status DBImpl::WriteToWAL(const autovector<WriteThread::Writer*>& write_group,
+                          log::Writer* log_writer, bool need_log_sync,
+                          bool need_log_dir_sync, SequenceNumber sequence) {
+  Status status;
+
+  WriteBatch* merged_batch = nullptr;
+  size_t write_with_wal = 0;
+  if (write_group.size() == 1 && write_group[0]->ShouldWriteToWAL() &&
+      write_group[0]->batch->GetWalTerminationPoint().is_cleared()) {
+    // we simply write the first WriteBatch to WAL if the group only
+    // contains one batch, that batch should be written to the WAL,
+    // and the batch does not need to be truncated
+    merged_batch = write_group[0]->batch;
+    write_group[0]->log_used = logfile_number_;
+    write_with_wal = 1;
+  } else {
+    // WAL needs all of the batches flattened into a single batch.
+    // We could avoid copying here with an iov-like AddRecord
+    // interface
+    merged_batch = &tmp_batch_;
+    for (auto writer : write_group) {
+      if (writer->ShouldWriteToWAL()) {
+        WriteBatchInternal::Append(merged_batch, writer->batch,
+                                   /*WAL_only*/ true);
+        write_with_wal++;
+      }
+      writer->log_used = logfile_number_;
+    }
+  }
+
+  WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+  Slice log_entry = WriteBatchInternal::Contents(merged_batch);
+  status = log_writer->AddRecord(log_entry);
+  total_log_size_ += log_entry.size();
+  alive_log_files_.back().AddSize(log_entry.size());
+  log_empty_ = false;
+  uint64_t log_size = log_entry.size();
+
+  if (status.ok() && need_log_sync) {
+    StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
+    // It's safe to access logs_ with unlocked mutex_ here because:
+    //  - we've set getting_synced=true for all logs,
+    //    so other threads won't pop from logs_ while we're here,
+    //  - only writer thread can push to logs_, and we're in
+    //    writer thread, so no one will push to logs_,
+    //  - as long as other threads don't modify it, it's safe to read
+    //    from std::deque from multiple threads concurrently.
+    for (auto& log : logs_) {
+      status = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+      if (!status.ok()) {
+        break;
+      }
+    }
+    if (status.ok() && need_log_dir_sync) {
+      // We only sync WAL directory the first time WAL syncing is
+      // requested, so that in case users never turn on WAL sync,
+      // we can avoid the disk I/O in the write code path.
+      status = directories_.GetWalDir()->Fsync();
+    }
+  }
+
+  if (merged_batch == &tmp_batch_) {
+    tmp_batch_.Clear();
+  }
+  if (status.ok()) {
+    auto stats = default_cf_internal_stats_;
+    if (need_log_sync) {
+      stats->AddDBStats(InternalStats::WAL_FILE_SYNCED, 1);
+      RecordTick(stats_, WAL_FILE_SYNCED);
+    }
+    stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size);
+    RecordTick(stats_, WAL_FILE_BYTES, log_size);
+    stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal);
+    RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+  }
+  return status;
+}
+
+Status DBImpl::HandleWALFull(WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr);
+  Status status;
+
+  if (alive_log_files_.begin()->getting_flushed) {
+    return status;
+  }
+
+  auto oldest_alive_log = alive_log_files_.begin()->number;
+  auto oldest_log_with_uncommitted_prep =
+      FindMinLogContainingOutstandingPrep();
+
+  if (allow_2pc() &&
+      oldest_log_with_uncommitted_prep > 0 &&
+      oldest_log_with_uncommitted_prep <= oldest_alive_log) {
+    if (unable_to_flush_oldest_log_) {
+      // We already attempted to flush all column families dependent on
+      // the oldest alive log, but that log still contains uncommitted
+      // transactions, so there is still nothing that we can do.
+      return status;
+    } else {
+      ROCKS_LOG_WARN(
+          immutable_db_options_.info_log,
+          "Unable to release oldest log due to uncommitted transaction");
+      unable_to_flush_oldest_log_ = true;
+    }
+  } else {
+    // we only mark this log as getting flushed if we have successfully
+    // flushed all data in this log. If this log contains outstanding prepared
+    // transactions then we cannot flush this log until those transactions are
+    // committed.
+    unable_to_flush_oldest_log_ = false;
+    alive_log_files_.begin()->getting_flushed = true;
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Flushing all column families with data in WAL number %" PRIu64
Total log size is %" PRIu64 + " while max_total_wal_size is %" PRIu64, + oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->OldestLogToKeep() <= oldest_alive_log) { + status = SwitchMemtable(cfd, write_context); + if (!status.ok()) { + break; + } + cfd->imm()->FlushRequested(); + SchedulePendingFlush(cfd); + } + } + MaybeScheduleFlushOrCompaction(); + return status; +} + +Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { + mutex_.AssertHeld(); + assert(write_context != nullptr); + Status status; + + // Before a new memtable is added in SwitchMemtable(), + // write_buffer_manager_->ShouldFlush() will keep returning true. If another + // thread is writing to another DB with the same write buffer, they may also + // be flushed. We may end up with flushing much more DBs than needed. It's + // suboptimal but still correct. + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Flushing column family with largest mem table size. Write buffer is " + "using %" PRIu64 " bytes out of a total of %" PRIu64 ".", + write_buffer_manager_->memory_usage(), + write_buffer_manager_->buffer_size()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + ColumnFamilyData* cfd_picked = nullptr; + SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; + + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (!cfd->mem()->IsEmpty()) { + // We only consider active mem table, hoping immutable memtable is + // already in the process of flushing. + uint64_t seq = cfd->mem()->GetCreationSeq(); + if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { + cfd_picked = cfd; + seq_num_for_cf_picked = seq; + } + } + } + if (cfd_picked != nullptr) { + status = SwitchMemtable(cfd_picked, write_context); + if (status.ok()) { + cfd_picked->imm()->FlushRequested(); + SchedulePendingFlush(cfd_picked); + MaybeScheduleFlushOrCompaction(); + } + } + return status; +} + +uint64_t DBImpl::GetMaxTotalWalSize() const { + mutex_.AssertHeld(); + return mutable_db_options_.max_total_wal_size == 0 + ? 
+             : mutable_db_options_.max_total_wal_size;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t num_bytes,
+                          const WriteOptions& write_options) {
+  uint64_t time_delayed = 0;
+  bool delayed = false;
+  {
+    StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed);
+    uint64_t delay = write_controller_.GetDelay(env_, num_bytes);
+    if (delay > 0) {
+      if (write_options.no_slowdown) {
+        return Status::Incomplete();
+      }
+      TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+
+      mutex_.Unlock();
+      // We will delay the write until we have slept for `delay` microseconds
+      // or we don't need a delay anymore
+      const uint64_t kDelayInterval = 1000;
+      uint64_t stall_end = sw.start_time() + delay;
+      while (write_controller_.NeedsDelay()) {
+        if (env_->NowMicros() >= stall_end) {
+          // We already delayed this write `delay` microseconds
+          break;
+        }
+
+        delayed = true;
+        // Sleep for 0.001 seconds
+        env_->SleepForMicroseconds(kDelayInterval);
+      }
+      mutex_.Lock();
+    }
+
+    while (bg_error_.ok() && write_controller_.IsStopped()) {
+      if (write_options.no_slowdown) {
+        return Status::Incomplete();
+      }
+      delayed = true;
+      TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+      bg_cv_.Wait();
+    }
+  }
+  assert(!delayed || !write_options.no_slowdown);
+  if (delayed) {
+    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS,
+                                           time_delayed);
+    RecordTick(stats_, STALL_MICROS, time_delayed);
+  }
+
+  return bg_error_;
+}
+
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+  ColumnFamilyData* cfd;
+  while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+    auto status = SwitchMemtable(cfd, context);
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+    if (!status.ok()) {
+      return status;
+    }
+  }
+  return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+                                    const MemTableInfo& mem_table_info) {
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  for (auto listener : immutable_db_options_.listeners) {
+    listener->OnMemTableSealed(mem_table_info);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+  mutex_.AssertHeld();
+  unique_ptr<WritableFile> lfile;
+  log::Writer* new_log = nullptr;
+  MemTable* new_mem = nullptr;
+
+  // Attempt to switch to a new memtable and trigger flush of old.
+  // Do this without holding the dbmutex lock.
+  assert(versions_->prev_log_number() == 0);
+  bool creating_new_log = !log_empty_;
+  uint64_t recycle_log_number = 0;
+  if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+      !log_recycle_files.empty()) {
+    recycle_log_number = log_recycle_files.front();
+    log_recycle_files.pop_front();
+  }
+  uint64_t new_log_number =
+      creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+  SuperVersion* new_superversion = nullptr;
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+  // Set current_memtable_info for memtable sealed callback
+#ifndef ROCKSDB_LITE
+  MemTableInfo memtable_info;
+  memtable_info.cf_name = cfd->GetName();
+  memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+  memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+  memtable_info.num_entries = cfd->mem()->num_entries();
+  memtable_info.num_deletes = cfd->mem()->num_deletes();
+#endif  // ROCKSDB_LITE
+  // Log this later after lock release. It may be outdated, e.g., if background
+  // flush happens before logging, but that should be ok.
+  int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  const auto preallocate_block_size =
+      GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+  mutex_.Unlock();
+  Status s;
+  {
+    if (creating_new_log) {
+      EnvOptions opt_env_opt =
+          env_->OptimizeForLogWrite(env_options_, db_options);
+      if (recycle_log_number) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "reusing log %" PRIu64 " from recycle list\n",
+                       recycle_log_number);
+        s = env_->ReuseWritableFile(
+            LogFileName(immutable_db_options_.wal_dir, new_log_number),
+            LogFileName(immutable_db_options_.wal_dir, recycle_log_number),
+            &lfile, opt_env_opt);
+      } else {
+        s = NewWritableFile(
+            env_, LogFileName(immutable_db_options_.wal_dir, new_log_number),
+            &lfile, opt_env_opt);
+      }
+      if (s.ok()) {
+        // Our final size should be less than write_buffer_size
+        // (compression, etc) but err on the side of caution.
+
+        // use preallocate_block_size instead
+        // of calling GetWalPreallocateBlockSize()
+        lfile->SetPreallocationBlockSize(preallocate_block_size);
+        unique_ptr<WritableFileWriter> file_writer(
+            new WritableFileWriter(std::move(lfile), opt_env_opt));
+        new_log =
+            new log::Writer(std::move(file_writer), new_log_number,
+                            immutable_db_options_.recycle_log_file_num > 0);
+      }
+    }
+
+    if (s.ok()) {
+      SequenceNumber seq = versions_->LastSequence();
+      new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+      new_superversion = new SuperVersion();
+    }
+
+#ifndef ROCKSDB_LITE
+    // PLEASE NOTE: We assume that there are no failable operations
+    // after lock is acquired below since we are already notifying
+    // client about mem table becoming immutable.
+    NotifyOnMemTableSealed(cfd, memtable_info);
+#endif  // ROCKSDB_LITE
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "[%s] New memtable created with log file: #%" PRIu64
+                 ". Immutable memtables: %d.\n",
+                 cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+  mutex_.Lock();
+  if (!s.ok()) {
+    // how do we fail if we're not creating new log?
+    assert(creating_new_log);
+    assert(!new_mem);
+    assert(!new_log);
+    return s;
+  }
+  if (creating_new_log) {
+    logfile_number_ = new_log_number;
+    assert(new_log != nullptr);
+    log_empty_ = true;
+    log_dir_synced_ = false;
+    logs_.emplace_back(logfile_number_, new_log);
+    alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+  }
+  for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
+    // all this is just optimization to delete logs that
+    // are no longer needed -- if CF is empty, that means it
+    // doesn't need that particular log to stay alive, so we just
+    // advance the log number. no need to persist this in the manifest
+    if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
+        loop_cfd->imm()->NumNotFlushed() == 0) {
+      if (creating_new_log) {
+        loop_cfd->SetLogNumber(logfile_number_);
+      }
+      loop_cfd->mem()->SetCreationSeq(versions_->LastSequence());
+    }
+  }
+
+  cfd->mem()->SetNextLogNumber(logfile_number_);
+  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+  new_mem->Ref();
+  cfd->SetMemtable(new_mem);
+  context->superversions_to_free_.push_back(InstallSuperVersionAndScheduleWork(
+      cfd, new_superversion, mutable_cf_options));
+  return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+  mutex_.AssertHeld();
+  size_t bsize = write_buffer_size / 10 + write_buffer_size;
+  // Some users might set very high write_buffer_size and rely on
+  // max_total_wal_size or other parameters to control the WAL size.
+  if (mutable_db_options_.max_total_wal_size > 0) {
+    bsize = std::min<size_t>(bsize, mutable_db_options_.max_total_wal_size);
+  }
+  if (immutable_db_options_.db_write_buffer_size > 0) {
+    bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+  }
+  if (immutable_db_options_.write_buffer_manager &&
+      immutable_db_options_.write_buffer_manager->enabled()) {
+    bsize = std::min<size_t>(
+        bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+  }
+
+  return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) {
+  // Pre-allocate size of write batch conservatively.
+  // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+  // and we allocate 11 extra bytes for key length, as well as value length.
+  WriteBatch batch(key.size() + value.size() + 24);
+  batch.Put(column_family, key, value);
+  return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                  const Slice& key) {
+  WriteBatch batch;
+  batch.Delete(column_family, key);
+  return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+                        ColumnFamilyHandle* column_family, const Slice& key) {
+  WriteBatch batch;
+  batch.SingleDelete(column_family, key);
+  return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+                       ColumnFamilyHandle* column_family,
+                       const Slice& begin_key, const Slice& end_key) {
+  WriteBatch batch;
+  batch.DeleteRange(column_family, begin_key, end_key);
+  return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                 const Slice& key, const Slice& value) {
+  WriteBatch batch;
+  batch.Merge(column_family, key, value);
+  return Write(opt, &batch);
+}
+}  // namespace rocksdb
diff --git a/src.mk b/src.mk
index 35c6fbf31..e7f49e365 100644
--- a/src.mk
+++ b/src.mk
@@ -13,6 +13,10 @@ LIB_SOURCES = \
   db/db_filesnapshot.cc \
   db/dbformat.cc \
   db/db_impl.cc \
+  db/db_impl_write.cc \
+  db/db_impl_compaction_flush.cc \
+  db/db_impl_files.cc \
+  db/db_impl_open.cc \
   db/db_impl_debug.cc \
   db/db_impl_readonly.cc \
   db/db_impl_experimental.cc \
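To close, the WAL preallocation arithmetic in GetWalPreallocateBlockSize above amounts to "write_buffer_size plus 10% slack, clamped by whichever caps are configured". A standalone restatement under that assumption, with zero meaning "unset" for each cap (the write_buffer_manager clamp is omitted here for brevity):

    #include <algorithm>
    #include <cstdint>

    uint64_t WalPreallocateBlockSize(uint64_t write_buffer_size,
                                     uint64_t max_total_wal_size,
                                     uint64_t db_write_buffer_size) {
      // Final log size should stay below write_buffer_size; err on caution.
      uint64_t bsize = write_buffer_size / 10 + write_buffer_size;
      if (max_total_wal_size > 0) {
        bsize = std::min(bsize, max_total_wal_size);
      }
      if (db_write_buffer_size > 0) {
        bsize = std::min(bsize, db_write_buffer_size);
      }
      return bsize;
    }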