Summary: A possible interleaved execution of a background compaction thread calling `FindObsoleteFiles (no full scan) / PurgeObsoleteFiles` and a user thread calling `FindObsoleteFiles (full scan) / PurgeObsoleteFiles` can lead to a race condition in which RocksDB attempts to delete a file twice. The second attempt fails and returns `IO error`. The same can happen to other file types, but this PR targets SST files. This PR also adds a unit test to verify the fix: the newly added `obsolete_files_test` has a test case for this scenario, implemented in `ObsoleteFilesTest#RaceForObsoleteFileDeletion`. `TestSyncPoint`s are used to coordinate the interleaving of the `user_thread` and the background compaction thread. They execute as follows:

```
timeline   user_thread                          background_compaction thread
  t1  |                                         FindObsoleteFiles(full_scan=false)
  t2  |    FindObsoleteFiles(full_scan=true)
  t3  |                                         PurgeObsoleteFiles
  t4  |    PurgeObsoleteFiles
      V
```

When `user_thread` invokes `FindObsoleteFiles` with full scan, it collects ALL files in the RocksDB directory, including the ones that the background compaction thread has already collected in its job context. `user_thread` then sees an IO error when trying to delete these files in `PurgeObsoleteFiles`, because the background compaction thread has already deleted them in its own `PurgeObsoleteFiles`. To fix this, we make RocksDB remember which (SST) files have been found by threads after calling `FindObsoleteFiles` (see `DBImpl#files_grabbed_for_purge_`), so that when another thread calls `FindObsoleteFiles` with full scan, it does not collect such files.

ajkr could you take a look and comment? Thanks!

Closes https://github.com/facebook/rocksdb/pull/3638

Differential Revision: D7384372

Pulled By: riversand963

fbshipit-source-id: 01489516d60012e722ee65a80e1449e589ce26d3
parent 90c542347a
commit 1f5def1653
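To make the fix concrete, here is a minimal standalone sketch of the claim-before-delete idea. It is not the actual `DBImpl` code: the class `ObsoleteFileTracker` and the methods `TryGrabForPurge`/`ReleaseAfterPurge` are hypothetical names, and an `unordered_set` guarded by its own mutex stands in for the real `files_grabbed_for_purge_` member, which lives in `DBImpl` and is protected by the DB mutex. The point is only that a full scan must skip any file number another thread has already claimed.

```
// Hypothetical, simplified model of the fix -- not the actual DBImpl code.
#include <cstdint>
#include <iostream>
#include <mutex>
#include <unordered_set>

class ObsoleteFileTracker {
 public:
  // Claim a file number for deletion. Returns false if another thread has
  // already grabbed it, in which case the caller must NOT try to delete it.
  bool TryGrabForPurge(uint64_t file_number) {
    std::lock_guard<std::mutex> guard(mu_);
    return files_grabbed_for_purge_.insert(file_number).second;
  }

  // Forget the file number once its deletion has actually completed.
  void ReleaseAfterPurge(uint64_t file_number) {
    std::lock_guard<std::mutex> guard(mu_);
    files_grabbed_for_purge_.erase(file_number);
  }

 private:
  std::mutex mu_;
  std::unordered_set<uint64_t> files_grabbed_for_purge_;
};

int main() {
  ObsoleteFileTracker tracker;
  // The background compaction thread grabs file 7 first
  // (FindObsoleteFiles, no full scan)...
  bool bg = tracker.TryGrabForPurge(7);
  // ...so the user thread's full scan must skip it rather than queue a
  // second deletion that would later fail with an IO error.
  bool user = tracker.TryGrabForPurge(7);
  std::cout << "bg grabbed: " << bg << ", user grabbed: " << user << "\n";
  tracker.ReleaseAfterPurge(7);  // after the background purge completes
  return 0;
}
```

In the PR itself the check happens inside `FindObsoleteFiles` under the DB mutex, and the remembered numbers are dropped again once the files are purged; the test below asserts, via the `DBImpl::CloseHelper:PendingPurgeFinished` callback, that `files_grabbed_for_purge_` is empty once all pending purges have finished.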
@@ -0,0 +1,217 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef ROCKSDB_LITE

#include <stdlib.h>
#include <map>
#include <string>
#include <vector>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/transaction_log.h"
#include "util/filename.h"
#include "util/string_util.h"
#include "util/sync_point.h"
#include "util/testharness.h"
#include "util/testutil.h"

using std::cerr;
using std::cout;
using std::endl;
using std::flush;

namespace rocksdb {

class ObsoleteFilesTest : public testing::Test {
 public:
  std::string dbname_;
  Options options_;
  DB* db_;
  Env* env_;
  int numlevels_;

  ObsoleteFilesTest() {
    db_ = nullptr;
    env_ = Env::Default();
    // Trigger compaction when the number of level-0 files reaches 2.
    options_.level0_file_num_compaction_trigger = 2;
    options_.disable_auto_compactions = false;
    options_.delete_obsolete_files_period_micros = 0;  // always do full purge
    options_.enable_thread_tracking = true;
    options_.write_buffer_size = 1024 * 1024 * 1000;
    options_.target_file_size_base = 1024 * 1024 * 1000;
    options_.max_bytes_for_level_base = 1024 * 1024 * 1000;
    options_.WAL_ttl_seconds = 300;     // Used to test log files
    options_.WAL_size_limit_MB = 1024;  // Used to test log files
    dbname_ = test::TmpDir() + "/double_deletefile_test";
    options_.wal_dir = dbname_ + "/wal_files";

    // Clean up all the files that might have been there before.
    std::vector<std::string> old_files;
    env_->GetChildren(dbname_, &old_files);
    for (auto file : old_files) {
      env_->DeleteFile(dbname_ + "/" + file);
    }
    env_->GetChildren(options_.wal_dir, &old_files);
    for (auto file : old_files) {
      env_->DeleteFile(options_.wal_dir + "/" + file);
    }

    DestroyDB(dbname_, options_);
    numlevels_ = 7;
    EXPECT_OK(ReopenDB(true));
  }

  Status ReopenDB(bool create) {
    delete db_;
    if (create) {
      DestroyDB(dbname_, options_);
    }
    db_ = nullptr;
    options_.create_if_missing = create;
    return DB::Open(options_, dbname_, &db_);
  }

  void CloseDB() {
    delete db_;
    db_ = nullptr;
  }

  void AddKeys(int numkeys, int startkey) {
    WriteOptions options;
    options.sync = false;
    for (int i = startkey; i < (numkeys + startkey); i++) {
      std::string temp = ToString(i);
      Slice key(temp);
      Slice value(temp);
      ASSERT_OK(db_->Put(options, key, value));
    }
  }

  int numKeysInLevels(std::vector<LiveFileMetaData>& metadata,
                      std::vector<int>* keysperlevel = nullptr) {
    if (keysperlevel != nullptr) {
      keysperlevel->resize(numlevels_);
    }

    int numKeys = 0;
    for (size_t i = 0; i < metadata.size(); i++) {
      int startkey = atoi(metadata[i].smallestkey.c_str());
      int endkey = atoi(metadata[i].largestkey.c_str());
      int numkeysinfile = (endkey - startkey + 1);
      numKeys += numkeysinfile;
      if (keysperlevel != nullptr) {
        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
      }
      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
              metadata[i].level, metadata[i].name.c_str(),
              metadata[i].smallestkey.c_str(),
              metadata[i].largestkey.c_str());
    }
    return numKeys;
  }

  void createLevel0Files(int numFiles, int numKeysPerFile) {
    int startKey = 0;
    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
    for (int i = 0; i < numFiles; i++) {
      AddKeys(numKeysPerFile, startKey);
      startKey += numKeysPerFile;
      ASSERT_OK(dbi->TEST_FlushMemTable());
      ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
    }
  }

  void CheckFileTypeCounts(std::string& dir, int required_log,
                           int required_sst, int required_manifest) {
    std::vector<std::string> filenames;
    env_->GetChildren(dir, &filenames);

    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
    for (auto file : filenames) {
      uint64_t number;
      FileType type;
      if (ParseFileName(file, &number, &type)) {
        log_cnt += (type == kLogFile);
        sst_cnt += (type == kTableFile);
        manifest_cnt += (type == kDescriptorFile);
      }
    }
    ASSERT_EQ(required_log, log_cnt);
    ASSERT_EQ(required_sst, sst_cnt);
    ASSERT_EQ(required_manifest, manifest_cnt);
  }
};
TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
  createLevel0Files(2, 50000);
  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);

  // Force the interleaving from the summary: the user thread's full-scan
  // FindObsoleteFiles runs after the background compaction thread has
  // collected its obsolete files, and the user thread's PurgeObsoleteFiles
  // runs after the background purge has finished.
  SyncPoint::GetInstance()->LoadDependency({
      {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
      {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
  });
  // Every deletion attempt must succeed; a double delete would surface here
  // as a non-OK status.
  SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
        Status* p_status = reinterpret_cast<Status*>(arg);
        ASSERT_TRUE(p_status->ok());
      });
  // By the time the DB closes, no file may still be marked as grabbed for
  // purge.
  SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
        std::vector<uint64_t>* files_grabbed_for_purge_ptr =
            reinterpret_cast<std::vector<uint64_t>*>(arg);
        ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
      });
  SyncPoint::GetInstance()->EnableProcessing();

  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  port::Thread user_thread([&]() {
    JobContext jobCxt(0);
    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
    dbi->TEST_LockMutex();
    dbi->FindObsoleteFiles(&jobCxt, true /* force */,
                           false /* no_full_scan */);
    dbi->TEST_UnlockMutex();
    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
    dbi->PurgeObsoleteFiles(jobCxt);
    jobCxt.Clean();
  });

  user_thread.join();

  CloseDB();
}

}  // namespace rocksdb

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#else
#include <stdio.h>

int main(int argc, char** argv) {
  fprintf(stderr,
          "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
  return 0;
}

#endif  // !ROCKSDB_LITE