Add a DB property to indicate number of background errors encountered

Summary: Add a property that reports the number of background errors encountered, to help users build their monitoring.

Test Plan: Add a unit test. Run `make all check`.

Reviewers: haobo, igor, dhruba

Reviewed By: igor

CC: ljin, nkg-, yhchiang, leveldb

Differential Revision: https://reviews.facebook.net/D16959
main
sdong 11 years ago
parent 1ec72b37b1
commit 71e6a34271
  1. 22
      db/db_impl.cc
  2. 2
      db/db_impl.h
  3. 42
      db/db_test.cc
  4. 16
      db/internal_stats.cc
  5. 20
      db/internal_stats.h

@ -9,6 +9,8 @@
#include "db/db_impl.h" #include "db/db_impl.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm> #include <algorithm>
#include <climits> #include <climits>
#include <cstdio> #include <cstdio>
@ -1806,8 +1808,10 @@ Status DBImpl::WaitForFlushMemTable() {
return s; return s;
} }
Status DBImpl::TEST_FlushMemTable() { Status DBImpl::TEST_FlushMemTable(bool wait) {
return FlushMemTable(FlushOptions()); FlushOptions fo;
fo.wait = wait;
return FlushMemTable(fo);
} }
Status DBImpl::TEST_WaitForFlushMemTable() { Status DBImpl::TEST_WaitForFlushMemTable() {
@ -1904,10 +1908,13 @@ void DBImpl::BackgroundCallFlush() {
// case this is an environmental problem and we do not want to // case this is an environmental problem and we do not want to
// chew up resources for failed compactions for the duration of // chew up resources for failed compactions for the duration of
// the problem. // the problem.
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
Log(options_.info_log, "Waiting after background flush error: %s",
s.ToString().c_str());
mutex_.Unlock(); mutex_.Unlock();
Log(options_.info_log,
"Waiting after background flush error: %s"
"Accumulated background error counts: %" PRIu64,
s.ToString().c_str(), error_cnt);
log_buffer.FlushBufferToLog(); log_buffer.FlushBufferToLog();
LogFlush(options_.info_log); LogFlush(options_.info_log);
env_->SleepForMicroseconds(1000000); env_->SleepForMicroseconds(1000000);
@ -1978,11 +1985,14 @@ void DBImpl::BackgroundCallCompaction() {
// case this is an environmental problem and we do not want to // case this is an environmental problem and we do not want to
// chew up resources for failed compactions for the duration of // chew up resources for failed compactions for the duration of
// the problem. // the problem.
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
mutex_.Unlock(); mutex_.Unlock();
log_buffer.FlushBufferToLog(); log_buffer.FlushBufferToLog();
Log(options_.info_log, "Waiting after background compaction error: %s", Log(options_.info_log,
s.ToString().c_str()); "Waiting after background compaction error: %s, "
"Accumulated background error counts: %" PRIu64,
s.ToString().c_str(), error_cnt);
LogFlush(options_.info_log); LogFlush(options_.info_log);
env_->SleepForMicroseconds(1000000); env_->SleepForMicroseconds(1000000);
mutex_.Lock(); mutex_.Lock();

@ -109,7 +109,7 @@ class DBImpl : public DB {
const Slice* end); const Slice* end);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status TEST_FlushMemTable(); Status TEST_FlushMemTable(bool wait = true);
// Wait for memtable compaction // Wait for memtable compaction
Status TEST_WaitForFlushMemTable(); Status TEST_WaitForFlushMemTable();

@ -4188,6 +4188,11 @@ TEST(DBTest, NoSpace) {
dbfull()->TEST_CompactRange(level, nullptr, nullptr); dbfull()->TEST_CompactRange(level, nullptr, nullptr);
} }
} }
std::string property_value;
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
ASSERT_EQ("5", property_value);
env_->no_space_.Release_Store(nullptr); env_->no_space_.Release_Store(nullptr);
ASSERT_LT(CountFiles(), num_files + 3); ASSERT_LT(CountFiles(), num_files + 3);
@ -4196,6 +4201,43 @@ TEST(DBTest, NoSpace) {
} while (ChangeCompactOptions()); } while (ChangeCompactOptions());
} }
// Verify that a failing background flush increments the value reported by
// the "rocksdb.background-errors" property.
TEST(DBTest, NoSpaceFlush) {
  do {
    Options options = CurrentOptions();
    options.env = env_;
    options.max_background_flushes = 1;
    Reopen(&options);

    ASSERT_OK(Put("foo", "v1"));
    env_->no_space_.Release_Store(env_);  // Force out-of-space errors

    // Before any flush is attempted the counter must read zero.
    std::string property_value;
    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
    ASSERT_EQ("0", property_value);

    // Request a flush but do not wait for it (wait == false).
    dbfull()->TEST_FlushMemTable(false);

    // Poll the property in slices of at most 1000 microseconds, giving up
    // once a total budget of 300000 microseconds has been spent or as soon
    // as the counter reads "1".
    for (int budget_micros = 300000; budget_micros > 0;) {
      int nap_micros = budget_micros;
      if (nap_micros > 1000) {
        nap_micros = 1000;
      }
      budget_micros -= nap_micros;
      env_->SleepForMicroseconds(nap_micros);
      ASSERT_TRUE(
          db_->GetProperty("rocksdb.background-errors", &property_value));
      if (property_value == "1") {
        break;
      }
    }
    ASSERT_EQ("1", property_value);

    // Stop injecting out-of-space errors before the next option set.
    env_->no_space_.Release_Store(nullptr);
  } while (ChangeCompactOptions());
}
TEST(DBTest, NonWritableFileSystem) { TEST(DBTest, NonWritableFileSystem) {
do { do {
Options options = CurrentOptions(); Options options = CurrentOptions();

@ -30,9 +30,11 @@ DBPropertyType GetPropertyType(const Slice& property) {
} else if (in == "num-immutable-mem-table") { } else if (in == "num-immutable-mem-table") {
return kNumImmutableMemTable; return kNumImmutableMemTable;
} else if (in == "mem-table-flush-pending") { } else if (in == "mem-table-flush-pending") {
return MemtableFlushPending; return kMemtableFlushPending;
} else if (in == "compaction-pending") { } else if (in == "compaction-pending") {
return CompactionPending; return kCompactionPending;
} else if (in == "background-errors") {
return kBackgroundErrors;
} }
return kUnknown; return kUnknown;
} }
@ -330,15 +332,21 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
case kNumImmutableMemTable: case kNumImmutableMemTable:
*value = std::to_string(imm.size()); *value = std::to_string(imm.size());
return true; return true;
case MemtableFlushPending: case kMemtableFlushPending:
// Return number of mem tables that are ready to flush (made immutable) // Return number of mem tables that are ready to flush (made immutable)
*value = std::to_string(imm.IsFlushPending() ? 1 : 0); *value = std::to_string(imm.IsFlushPending() ? 1 : 0);
return true; return true;
case CompactionPending: case kCompactionPending:
// 1 if the system already determines at least one compaction is needed. // 1 if the system already determines at least one compaction is needed.
// 0 otherwise, // 0 otherwise,
*value = std::to_string(current->NeedsCompaction() ? 1 : 0); *value = std::to_string(current->NeedsCompaction() ? 1 : 0);
return true; return true;
/////////////
case kBackgroundErrors:
// Accumulated number of errors in background flushes or compactions.
*value = std::to_string(GetBackgroundErrorCount());
return true;
/////////
default: default:
return false; return false;
} }

@ -26,9 +26,11 @@ enum DBPropertyType {
kStats, // Return general statistics of DB kStats, // Return general statistics of DB
kSsTables, // Return a human readable string of current SST files kSsTables, // Return a human readable string of current SST files
kNumImmutableMemTable, // Return number of immutable mem tables kNumImmutableMemTable, // Return number of immutable mem tables
MemtableFlushPending, // Return 1 if mem table flushing is pending, otherwise kMemtableFlushPending, // Return 1 if mem table flushing is pending,
// 0. // otherwise
CompactionPending, // Return 1 if a compaction is pending. Otherwise 0. // 0.
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
kBackgroundErrors, // Return accumulated background errors encountered.
kUnknown, kUnknown,
}; };
@ -49,6 +51,7 @@ class InternalStats {
stall_counts_(WRITE_STALLS_ENUM_MAX, 0), stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
stall_leveln_slowdown_(num_levels, 0), stall_leveln_slowdown_(num_levels, 0),
stall_leveln_slowdown_count_(num_levels, 0), stall_leveln_slowdown_count_(num_levels, 0),
bg_error_count_(0),
number_levels_(num_levels), number_levels_(num_levels),
statistics_(statistics), statistics_(statistics),
env_(env), env_(env),
@ -116,6 +119,10 @@ class InternalStats {
stall_leveln_slowdown_count_[level] += micros; stall_leveln_slowdown_count_[level] += micros;
} }
// Returns the current value of the background error counter.
uint64_t GetBackgroundErrorCount() const {
  const uint64_t current_count = bg_error_count_;
  return current_count;
}
// Adds one to the background error counter and returns the updated value.
uint64_t BumpAndGetBackgroundErrorCount() {
  bg_error_count_ += 1;
  return bg_error_count_;
}
bool GetProperty(DBPropertyType property_type, const Slice& property, bool GetProperty(DBPropertyType property_type, const Slice& property,
std::string* value, VersionSet* version_set, std::string* value, VersionSet* version_set,
const MemTableList& imm); const MemTableList& imm);
@ -158,6 +165,13 @@ class InternalStats {
std::vector<uint64_t> stall_leveln_slowdown_; std::vector<uint64_t> stall_leveln_slowdown_;
std::vector<uint64_t> stall_leveln_slowdown_count_; std::vector<uint64_t> stall_leveln_slowdown_count_;
// Total number of background errors encountered. Every time a flush task
// or compaction task fails, this counter is incremented. The failure can
// be caused by any possible reason, including file system errors, out of
// resources, or input file corruption. Failing when retrying the same flush
// or compaction will cause the counter to increase too.
uint64_t bg_error_count_;
int number_levels_; int number_levels_;
Statistics* statistics_; Statistics* statistics_;
Env* env_; Env* env_;

Loading…
Cancel
Save