From 71e6a34271c691f67747c14a74ffaa587f7c797d Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 18 Mar 2014 12:25:08 -0700 Subject: [PATCH] Add a DB property to indicate number of background errors encountered Summary: Add a property to calculate number of background errors encountered to help users build their monitoring Test Plan: Add a unit test. make all check Reviewers: haobo, igor, dhruba Reviewed By: igor CC: ljin, nkg-, yhchiang, leveldb Differential Revision: https://reviews.facebook.net/D16959 --- db/db_impl.cc | 22 ++++++++++++++++------ db/db_impl.h | 2 +- db/db_test.cc | 42 ++++++++++++++++++++++++++++++++++++++++++ db/internal_stats.cc | 16 ++++++++++++---- db/internal_stats.h | 20 +++++++++++++++++--- 5 files changed, 88 insertions(+), 14 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 7b77cbaaa..70652b4c7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -9,6 +9,8 @@ #include "db/db_impl.h" +#define __STDC_FORMAT_MACROS +#include <inttypes.h> #include <algorithm> #include <climits> #include <cstdio> @@ -1806,8 +1808,10 @@ Status DBImpl::WaitForFlushMemTable() { return s; } -Status DBImpl::TEST_FlushMemTable() { - return FlushMemTable(FlushOptions()); +Status DBImpl::TEST_FlushMemTable(bool wait) { + FlushOptions fo; + fo.wait = wait; + return FlushMemTable(fo); } Status DBImpl::TEST_WaitForFlushMemTable() { @@ -1904,10 +1908,13 @@ void DBImpl::BackgroundCallFlush() { // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of // the problem. 
+ uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error - Log(options_.info_log, "Waiting after background flush error: %s", - s.ToString().c_str()); mutex_.Unlock(); + Log(options_.info_log, + "Waiting after background flush error: %s" + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); LogFlush(options_.info_log); env_->SleepForMicroseconds(1000000); @@ -1978,11 +1985,14 @@ void DBImpl::BackgroundCallCompaction() { // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of // the problem. + uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); - Log(options_.info_log, "Waiting after background compaction error: %s", - s.ToString().c_str()); + Log(options_.info_log, + "Waiting after background compaction error: %s, " + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); LogFlush(options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); diff --git a/db/db_impl.h b/db/db_impl.h index 6e6dc425a..dbcfb39aa 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -109,7 +109,7 @@ class DBImpl : public DB { const Slice* end); // Force current memtable contents to be flushed. 
- Status TEST_FlushMemTable(); + Status TEST_FlushMemTable(bool wait = true); // Wait for memtable compaction Status TEST_WaitForFlushMemTable(); diff --git a/db/db_test.cc b/db/db_test.cc index 8e99bcec9..5c076ce5b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4188,6 +4188,11 @@ TEST(DBTest, NoSpace) { dbfull()->TEST_CompactRange(level, nullptr, nullptr); } } + + std::string property_value; + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("5", property_value); + env_->no_space_.Release_Store(nullptr); ASSERT_LT(CountFiles(), num_files + 3); @@ -4196,6 +4201,43 @@ TEST(DBTest, NoSpace) { } while (ChangeCompactOptions()); } +// Check background error counter bumped on flush failures. +TEST(DBTest, NoSpaceFlush) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.max_background_flushes = 1; + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + env_->no_space_.Release_Store(env_); // Force out-of-space errors + + std::string property_value; + // Background error count is 0 now. + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("0", property_value); + + dbfull()->TEST_FlushMemTable(false); + + // Wait 300 milliseconds or background-errors turned 1 from 0. + int time_to_sleep_limit = 300000; + while (time_to_sleep_limit > 0) { + int to_sleep = (time_to_sleep_limit > 1000) ? 
1000 : time_to_sleep_limit; + time_to_sleep_limit -= to_sleep; + env_->SleepForMicroseconds(to_sleep); + + ASSERT_TRUE( + db_->GetProperty("rocksdb.background-errors", &property_value)); + if (property_value == "1") { + break; + } + } + ASSERT_EQ("1", property_value); + + env_->no_space_.Release_Store(nullptr); + } while (ChangeCompactOptions()); +} + TEST(DBTest, NonWritableFileSystem) { do { Options options = CurrentOptions(); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 1f81023f6..629941c88 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -30,9 +30,11 @@ DBPropertyType GetPropertyType(const Slice& property) { } else if (in == "num-immutable-mem-table") { return kNumImmutableMemTable; } else if (in == "mem-table-flush-pending") { - return MemtableFlushPending; + return kMemtableFlushPending; } else if (in == "compaction-pending") { - return CompactionPending; + return kCompactionPending; + } else if (in == "background-errors") { + return kBackgroundErrors; } return kUnknown; } @@ -330,15 +332,21 @@ bool InternalStats::GetProperty(DBPropertyType property_type, case kNumImmutableMemTable: *value = std::to_string(imm.size()); return true; - case MemtableFlushPending: + case kMemtableFlushPending: // Return number of mem tables that are ready to flush (made immutable) *value = std::to_string(imm.IsFlushPending() ? 1 : 0); return true; - case CompactionPending: + case kCompactionPending: // 1 if the system already determines at least one compacdtion is needed. // 0 otherwise, *value = std::to_string(current->NeedsCompaction() ? 1 : 0); return true; + ///////////// + case kBackgroundErrors: + // Accumulated number of errors in background flushes or compactions. 
+ *value = std::to_string(GetBackgroundErrorCount()); + return true; + ///////// default: return false; } diff --git a/db/internal_stats.h b/db/internal_stats.h index 5f1a6263a..b6032d014 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -26,9 +26,11 @@ enum DBPropertyType { kStats, // Return general statitistics of DB kSsTables, // Return a human readable string of current SST files kNumImmutableMemTable, // Return number of immutable mem tables - MemtableFlushPending, // Return 1 if mem table flushing is pending, otherwise - // 0. - CompactionPending, // Return 1 if a compaction is pending. Otherwise 0. + kMemtableFlushPending, // Return 1 if mem table flushing is pending, + // otherwise + // 0. + kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. + kBackgroundErrors, // Return accumulated background errors encountered. kUnknown, }; @@ -49,6 +51,7 @@ class InternalStats { stall_counts_(WRITE_STALLS_ENUM_MAX, 0), stall_leveln_slowdown_(num_levels, 0), stall_leveln_slowdown_count_(num_levels, 0), + bg_error_count_(0), number_levels_(num_levels), statistics_(statistics), env_(env), @@ -116,6 +119,10 @@ class InternalStats { stall_leveln_slowdown_count_[level] += micros; } + uint64_t GetBackgroundErrorCount() const { return bg_error_count_; } + + uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } + bool GetProperty(DBPropertyType property_type, const Slice& property, std::string* value, VersionSet* version_set, const MemTableList& imm); @@ -158,6 +165,13 @@ class InternalStats { std::vector<uint64_t> stall_leveln_slowdown_; std::vector<uint64_t> stall_leveln_slowdown_count_; + // Total number of background errors encountered. Every time a flush task + // or compaction task fails, this counter is incremented. The failure can + // be caused by any possible reason, including file system errors, out of + // resources, or input file corruption. 
Failing when retrying the same flush + // or compaction will cause the counter to increase too. + uint64_t bg_error_count_; + int number_levels_; Statistics* statistics_; Env* env_;