|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "file/filename.h"
|
|
|
|
#include <cinttypes>
|
|
|
|
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <vector>
|
|
|
|
#include "file/writable_file_writer.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "util/stop_watch.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
// Fixed names, prefixes and suffixes of files that live in a DB directory.
// NOTE(review): these three are not `static`; presumably they are declared
// `extern` in file/filename.h -- confirm.
const std::string kCurrentFileName("CURRENT");
const std::string kOptionsFileNamePrefix("OPTIONS-");
const std::string kTempFileNameSuffix("dbtmp");

// Extensions and directory names used only inside this translation unit.
// Both "sst" and "ldb" are treated as table-file extensions by
// ParseFileName().
static const std::string kRocksDbTFileExt("sst");
static const std::string kLevelDbTFileExt("ldb");
static const std::string kRocksDBBlobFileExt("blob");
static const std::string kArchivalDirName("archive");
|
|
|
|
|
|
|
|
// Flattens `path` into a string that is safe to use as a file name and
// stores it, followed by "_LOG" and a trailing '\0', in `dest`.
//
// Every char of `path` that is not in [0-9a-zA-Z], '-', '_' or '.' is
// replaced with '_', except that such a char at position 0 is dropped
// instead. The flattened name is truncated if necessary so that "_LOG\0"
// always fits within `len` bytes.
//
// `dest` must point to at least `len` writable bytes.
// Returns the number of chars stored in `dest`, not including the trailing
// '\0'. If `len` is too small to hold even "_LOG\0", only an empty string is
// stored (when `len` > 0) and 0 is returned.
static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) {
  const char suffix[] = "_LOG";
  // sizeof also counts the trailing '\0'.
  const size_t suffix_size = sizeof(suffix);

  // Validate `len` while it is still signed. Mixing it with sizeof() first
  // would convert it to size_t, and a too-small or negative value would wrap
  // around to a huge bound, letting the loop below overrun `dest`.
  if (len <= 0) {
    return 0;
  }
  const size_t capacity = static_cast<size_t>(len);
  if (capacity < suffix_size) {
    dest[0] = '\0';
    return 0;
  }

  // Room left for flattened path chars once the suffix is reserved.
  const size_t max_flattened = capacity - suffix_size;
  size_t write_idx = 0;
  for (size_t i = 0; i < path.size() && write_idx < max_flattened; ++i) {
    const char c = path[i];
    const bool keep = (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
                      (c >= 'A' && c <= 'Z') || c == '-' || c == '.' ||
                      c == '_';
    if (keep) {
      dest[write_idx++] = c;
    } else if (i > 0) {
      // A disallowed leading char (e.g. the '/' of an absolute path) is
      // dropped rather than turned into a leading '_'.
      dest[write_idx++] = '_';
    }
  }
  assert(suffix_size <= capacity - write_idx);
  // "\0" is automatically added by snprintf. An explicit "%s" keeps the
  // suffix from ever being interpreted as a format string.
  snprintf(dest + write_idx, capacity - write_idx, "%s", suffix);
  return write_idx + suffix_size - 1;
}
|
|
|
|
|
|
|
|
// Builds the bare file name "<number>.<suffix>". The number is zero-padded to
// at least six digits, e.g. (5, "sst") yields "000005.sst".
static std::string MakeFileName(uint64_t number, const char* suffix) {
  char name_buf[100];
  snprintf(name_buf, sizeof(name_buf), "%06" PRIu64 ".%s", number, suffix);
  return std::string(name_buf);
}
|
|
|
|
|
|
|
|
static std::string MakeFileName(const std::string& name, uint64_t number,
|
|
|
|
const char* suffix) {
|
|
|
|
return name + "/" + MakeFileName(number, suffix);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string LogFileName(const std::string& name, uint64_t number) {
|
|
|
|
assert(number > 0);
|
|
|
|
return MakeFileName(name, number, "log");
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string LogFileName(uint64_t number) {
|
|
|
|
assert(number > 0);
|
|
|
|
return MakeFileName(number, "log");
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string BlobFileName(uint64_t number) {
|
|
|
|
assert(number > 0);
|
|
|
|
return MakeFileName(number, kRocksDBBlobFileExt.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string BlobFileName(const std::string& blobdirname, uint64_t number) {
|
|
|
|
assert(number > 0);
|
|
|
|
return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string BlobFileName(const std::string& dbname, const std::string& blob_dir,
|
|
|
|
uint64_t number) {
|
|
|
|
assert(number > 0);
|
|
|
|
return MakeFileName(dbname + "/" + blob_dir, number,
|
|
|
|
kRocksDBBlobFileExt.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ArchivalDirectory(const std::string& dir) {
|
|
|
|
return dir + "/" + kArchivalDirName;
|
|
|
|
}
|
|
|
|
std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
|
|
|
|
assert(number > 0);
|
|
|
|
return MakeFileName(name + "/" + kArchivalDirName, number, "log");
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string MakeTableFileName(const std::string& path, uint64_t number) {
|
|
|
|
return MakeFileName(path, number, kRocksDbTFileExt.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string MakeTableFileName(uint64_t number) {
|
|
|
|
return MakeFileName(number, kRocksDbTFileExt.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string Rocks2LevelTableFileName(const std::string& fullname) {
|
|
|
|
assert(fullname.size() > kRocksDbTFileExt.size() + 1);
|
|
|
|
if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
|
|
|
|
kLevelDbTFileExt;
|
|
|
|
}
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
// Extracts the file number from a table file name or path.
//
// The number is the run of decimal digits immediately to the left of the
// last '.' in `name`, e.g. "/db/000123.sst" yields 123. Returns 0 when
// `name` contains no '.' or when no digit directly precedes the last '.'.
// Digit runs too long for uint64_t wrap around (unsigned arithmetic).
uint64_t TableFileNameToNumber(const std::string& name) {
  // Keep the position as size_t and test for npos explicitly instead of
  // narrowing it to int, which would truncate positions above INT_MAX and
  // relied on npos turning into -1.
  const size_t dot_pos = name.find_last_of('.');
  if (dot_pos == std::string::npos) {
    return 0;
  }
  uint64_t number = 0;
  uint64_t base = 1;
  // Walk left from the char just before the dot, accumulating digits from
  // least to most significant, and stop at the first non-digit.
  for (size_t pos = dot_pos; pos > 0; --pos) {
    const char c = name[pos - 1];
    if (c < '0' || c > '9') {
      break;
    }
    number += static_cast<uint64_t>(c - '0') * base;
    base *= 10;
  }
  return number;
}
|
|
|
|
|
|
|
|
std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
|
|
|
|
uint32_t path_id) {
|
|
|
|
assert(number > 0);
|
|
|
|
std::string path;
|
|
|
|
if (path_id >= db_paths.size()) {
|
|
|
|
path = db_paths.back().path;
|
|
|
|
} else {
|
|
|
|
path = db_paths[path_id].path;
|
|
|
|
}
|
|
|
|
return MakeTableFileName(path, number);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Writes a human-readable form of a file number into `out_buf`, using at
// most `out_buf_size` bytes including the trailing '\0'.
// For path_id 0 the output is just "<number>"; otherwise it is
// "<number>(path <path_id>)".
void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
                      size_t out_buf_size) {
  if (path_id != 0) {
    snprintf(out_buf, out_buf_size, "%" PRIu64 "(path %" PRIu32 ")", number,
             path_id);
    return;
  }
  snprintf(out_buf, out_buf_size, "%" PRIu64, number);
}
|
|
|
|
|
|
|
|
// Returns the bare MANIFEST file name "MANIFEST-<number>", with the number
// zero-padded to at least six digits. `number` must be non-zero.
std::string DescriptorFileName(uint64_t number) {
  assert(number != 0);
  char manifest_name[100];
  snprintf(manifest_name, sizeof(manifest_name), "MANIFEST-%06" PRIu64,
           number);
  return std::string(manifest_name);
}
|
|
|
|
|
|
|
|
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
|
|
|
|
return dbname + "/" + DescriptorFileName(number);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string CurrentFileName(const std::string& dbname) {
|
|
|
|
return dbname + "/" + kCurrentFileName;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the path of the lock file inside `dbname`, i.e. "<dbname>/LOCK".
std::string LockFileName(const std::string& dbname) {
  std::string lock_path(dbname);
  lock_path.append("/LOCK");
  return lock_path;
}
|
|
|
|
|
|
|
|
std::string TempFileName(const std::string& dbname, uint64_t number) {
|
|
|
|
return MakeFileName(dbname, number, kTempFileNameSuffix.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fills `buf` with the info log file name prefix and points `prefix` at it.
// Without a separate log directory the prefix is simply "LOG". With one, the
// prefix is derived from the normalized absolute DB path by
// GetInfoLogPrefix().
InfoLogPrefix::InfoLogPrefix(bool has_log_dir,
                             const std::string& db_absolute_path) {
  if (has_log_dir) {
    const size_t prefix_len =
        GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf));
    prefix = Slice(buf, prefix_len);
    return;
  }
  const char kInfoLogPrefix[] = "LOG";
  // "\0" is automatically added to the end by snprintf.
  snprintf(buf, sizeof(buf), "%s", kInfoLogPrefix);
  // sizeof also counts the trailing '\0', which is not part of the prefix.
  prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1);
}
|
|
|
|
|
|
|
|
std::string InfoLogFileName(const std::string& dbname,
|
|
|
|
const std::string& db_path, const std::string& log_dir) {
|
|
|
|
if (log_dir.empty()) {
|
|
|
|
return dbname + "/LOG";
|
|
|
|
}
|
|
|
|
|
|
|
|
InfoLogPrefix info_log_prefix(true, db_path);
|
|
|
|
return log_dir + "/" + info_log_prefix.buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the name of the old info log file for "dbname".
|
|
|
|
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
|
|
|
|
const std::string& db_path, const std::string& log_dir) {
|
|
|
|
char buf[50];
|
|
|
|
snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
|
|
|
|
|
|
|
|
if (log_dir.empty()) {
|
|
|
|
return dbname + "/LOG.old." + buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
InfoLogPrefix info_log_prefix(true, db_path);
|
|
|
|
return log_dir + "/" + info_log_prefix.buf + ".old." + buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the bare options file name "OPTIONS-<file_num>", with the number
// zero-padded to at least six digits.
std::string OptionsFileName(uint64_t file_num) {
  // 32 bytes comfortably hold the 20 digits of the largest uint64_t.
  char num_buf[32];
  snprintf(num_buf, sizeof(num_buf), "%06" PRIu64, file_num);
  return kOptionsFileNamePrefix + num_buf;
}
|
|
|
|
std::string OptionsFileName(const std::string& dbname, uint64_t file_num) {
|
|
|
|
return dbname + "/" + OptionsFileName(file_num);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) {
|
|
|
|
char buffer[256];
|
|
|
|
snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s",
|
|
|
|
kOptionsFileNamePrefix.c_str(), file_num,
|
|
|
|
kTempFileNameSuffix.c_str());
|
|
|
|
return dbname + "/" + buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the path of meta database `number` inside `dbname`, i.e.
// "<dbname>/METADB-<number>". The number is not zero-padded.
std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
  char suffix_buf[100];
  snprintf(suffix_buf, sizeof(suffix_buf), "/METADB-%" PRIu64, number);
  std::string meta_path(dbname);
  meta_path.append(suffix_buf);
  return meta_path;
}
|
|
|
|
|
|
|
|
// Returns the path of the identity file inside `dbname`, i.e.
// "<dbname>/IDENTITY".
std::string IdentityFileName(const std::string& dbname) {
  std::string identity_path(dbname);
  identity_path.append("/IDENTITY");
  return identity_path;
}
|
|
|
|
|
|
|
|
// Owned filenames have the form:
|
|
|
|
// dbname/IDENTITY
|
|
|
|
// dbname/CURRENT
|
|
|
|
// dbname/LOCK
|
|
|
|
// dbname/<info_log_name_prefix>
|
|
|
|
// dbname/<info_log_name_prefix>.old.[0-9]+
|
|
|
|
// dbname/MANIFEST-[0-9]+
|
|
|
|
// dbname/[0-9]+.(log|sst|blob)
|
|
|
|
// dbname/METADB-[0-9]+
|
|
|
|
// dbname/OPTIONS-[0-9]+
|
|
|
|
// dbname/OPTIONS-[0-9]+.dbtmp
|
|
|
|
// Disregards / at the beginning
|
|
|
|
bool ParseFileName(const std::string& fname,
|
|
|
|
uint64_t* number,
|
|
|
|
FileType* type,
|
|
|
|
WalFileType* log_type) {
|
|
|
|
return ParseFileName(fname, number, "", type, log_type);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ParseFileName(const std::string& fname, uint64_t* number,
|
|
|
|
const Slice& info_log_name_prefix, FileType* type,
|
|
|
|
WalFileType* log_type) {
|
|
|
|
Slice rest(fname);
|
|
|
|
if (fname.length() > 1 && fname[0] == '/') {
|
|
|
|
rest.remove_prefix(1);
|
|
|
|
}
|
|
|
|
if (rest == "IDENTITY") {
|
|
|
|
*number = 0;
|
|
|
|
*type = kIdentityFile;
|
|
|
|
} else if (rest == "CURRENT") {
|
|
|
|
*number = 0;
|
|
|
|
*type = kCurrentFile;
|
|
|
|
} else if (rest == "LOCK") {
|
|
|
|
*number = 0;
|
|
|
|
*type = kDBLockFile;
|
|
|
|
} else if (info_log_name_prefix.size() > 0 &&
|
|
|
|
rest.starts_with(info_log_name_prefix)) {
|
|
|
|
rest.remove_prefix(info_log_name_prefix.size());
|
|
|
|
if (rest == "" || rest == ".old") {
|
|
|
|
*number = 0;
|
|
|
|
*type = kInfoLogFile;
|
|
|
|
} else if (rest.starts_with(".old.")) {
|
|
|
|
uint64_t ts_suffix;
|
|
|
|
// sizeof also counts the trailing '\0'.
|
|
|
|
rest.remove_prefix(sizeof(".old.") - 1);
|
|
|
|
if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*number = ts_suffix;
|
|
|
|
*type = kInfoLogFile;
|
|
|
|
}
|
|
|
|
} else if (rest.starts_with("MANIFEST-")) {
|
|
|
|
rest.remove_prefix(strlen("MANIFEST-"));
|
|
|
|
uint64_t num;
|
|
|
|
if (!ConsumeDecimalNumber(&rest, &num)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (!rest.empty()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*type = kDescriptorFile;
|
|
|
|
*number = num;
|
|
|
|
} else if (rest.starts_with("METADB-")) {
|
|
|
|
rest.remove_prefix(strlen("METADB-"));
|
|
|
|
uint64_t num;
|
|
|
|
if (!ConsumeDecimalNumber(&rest, &num)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (!rest.empty()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*type = kMetaDatabase;
|
|
|
|
*number = num;
|
|
|
|
} else if (rest.starts_with(kOptionsFileNamePrefix)) {
|
|
|
|
uint64_t ts_suffix;
|
|
|
|
bool is_temp_file = false;
|
|
|
|
rest.remove_prefix(kOptionsFileNamePrefix.size());
|
|
|
|
const std::string kTempFileNameSuffixWithDot =
|
|
|
|
std::string(".") + kTempFileNameSuffix;
|
|
|
|
if (rest.ends_with(kTempFileNameSuffixWithDot)) {
|
|
|
|
rest.remove_suffix(kTempFileNameSuffixWithDot.size());
|
|
|
|
is_temp_file = true;
|
|
|
|
}
|
|
|
|
if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*number = ts_suffix;
|
|
|
|
*type = is_temp_file ? kTempFile : kOptionsFile;
|
|
|
|
} else {
|
|
|
|
// Avoid strtoull() to keep filename format independent of the
|
|
|
|
// current locale
|
|
|
|
bool archive_dir_found = false;
|
|
|
|
if (rest.starts_with(kArchivalDirName)) {
|
|
|
|
if (rest.size() <= kArchivalDirName.size()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
rest.remove_prefix(kArchivalDirName.size() +
|
|
|
|
1); // Add 1 to remove / also
|
|
|
|
if (log_type) {
|
|
|
|
*log_type = kArchivedLogFile;
|
|
|
|
}
|
|
|
|
archive_dir_found = true;
|
|
|
|
}
|
|
|
|
uint64_t num;
|
|
|
|
if (!ConsumeDecimalNumber(&rest, &num)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (rest.size() <= 1 || rest[0] != '.') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
rest.remove_prefix(1);
|
|
|
|
|
|
|
|
Slice suffix = rest;
|
|
|
|
if (suffix == Slice("log")) {
|
|
|
|
*type = kWalFile;
|
|
|
|
if (log_type && !archive_dir_found) {
|
|
|
|
*log_type = kAliveLogFile;
|
|
|
|
}
|
|
|
|
} else if (archive_dir_found) {
|
|
|
|
return false; // Archive dir can contain only log files
|
|
|
|
} else if (suffix == Slice(kRocksDbTFileExt) ||
|
|
|
|
suffix == Slice(kLevelDbTFileExt)) {
|
|
|
|
*type = kTableFile;
|
|
|
|
} else if (suffix == Slice(kRocksDBBlobFileExt)) {
|
|
|
|
*type = kBlobFile;
|
|
|
|
} else if (suffix == Slice(kTempFileNameSuffix)) {
|
|
|
|
*type = kTempFile;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*number = num;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
// Makes the CURRENT file of `dbname` name MANIFEST file `descriptor_number`.
//
// The manifest's file name (without the "dbname/" part) plus a newline is
// written to the temporary file TempFileName(dbname, descriptor_number),
// which is then renamed to CurrentFileName(dbname). If the rename succeeds
// and `directory_to_fsync` is non-null, that directory is fsynced. If the
// write or the rename fails, deletion of the temporary file is attempted.
// Returns the status of the last of these steps that was attempted (the
// status of the cleanup deletion is ignored).
IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
                        uint64_t descriptor_number,
                        FSDirectory* directory_to_fsync) {
  // Remove leading "dbname/" and add newline to manifest file name
  const std::string manifest_path =
      DescriptorFileName(dbname, descriptor_number);
  Slice manifest_name(manifest_path);
  assert(manifest_name.starts_with(dbname + "/"));
  manifest_name.remove_prefix(dbname.size() + 1);

  const std::string tmp_path = TempFileName(dbname, descriptor_number);
  IOStatus io_s =
      WriteStringToFile(fs, manifest_name.ToString() + "\n", tmp_path, true);
  TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &io_s);
  if (io_s.ok()) {
    TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2);
    io_s = fs->RenameFile(tmp_path, CurrentFileName(dbname), IOOptions(),
                          nullptr);
    TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2);
    TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &io_s);
  }
  if (!io_s.ok()) {
    // NOTE: PermitUncheckedError is acceptable here as we are already
    // handling an error case, and this is just a best-attempt effort at some
    // cleanup
    fs->DeleteFile(tmp_path, IOOptions(), nullptr).PermitUncheckedError();
  } else if (directory_to_fsync != nullptr) {
    io_s = directory_to_fsync->FsyncWithDirOptions(
        IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname)));
  }
  return io_s;
}
|
|
|
|
|
|
|
|
Status SetIdentityFile(Env* env, const std::string& dbname,
|
|
|
|
const std::string& db_id) {
|
|
|
|
std::string id;
|
|
|
|
if (db_id.empty()) {
|
|
|
|
id = env->GenerateUniqueId();
|
|
|
|
} else {
|
|
|
|
id = db_id;
|
|
|
|
}
|
|
|
|
assert(!id.empty());
|
|
|
|
// Reserve the filename dbname/000000.dbtmp for the temporary identity file
|
|
|
|
std::string tmp = TempFileName(dbname, 0);
|
|
|
|
std::string identify_file_name = IdentityFileName(dbname);
|
|
|
|
Status s = WriteStringToFile(env, id, tmp, true);
|
|
|
|
if (s.ok()) {
|
|
|
|
s = env->RenameFile(tmp, identify_file_name);
|
|
|
|
}
|
|
|
|
std::unique_ptr<FSDirectory> dir_obj;
|
|
|
|
if (s.ok()) {
|
|
|
|
s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj,
|
|
|
|
nullptr);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
|
|
|
|
DirFsyncOptions(identify_file_name));
|
|
|
|
}
|
|
|
|
|
|
|
|
// The default Close() could return "NotSupported" and we bypass it
|
|
|
|
// if it is not impelmented. Detailed explanations can be found in
|
|
|
|
// db/db_impl/db_impl.h
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
3 years ago
|
|
|
if (s.ok()) {
|
|
|
|
Status temp_s = dir_obj->Close(IOOptions(), nullptr);
|
|
|
|
if (!temp_s.ok()) {
|
|
|
|
if (temp_s.IsNotSupported()) {
|
|
|
|
temp_s.PermitUncheckedError();
|
|
|
|
} else {
|
|
|
|
s = temp_s;
|
|
|
|
}
|
|
|
|
}
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
3 years ago
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
env->DeleteFile(tmp).PermitUncheckedError();
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Syncs the MANIFEST file writer `file`, forwarding db_options->use_fsync to
// WritableFileWriter::Sync(), and returns that call's IOStatus.
//
// A StopWatch built from db_options->clock / db_options->stats and tagged
// MANIFEST_FILE_SYNC_MICROS is alive for the duration of the sync
// (presumably recording the elapsed time when it goes out of scope).
IOStatus SyncManifest(const ImmutableDBOptions* db_options,
                      WritableFileWriter* file) {
  TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2);
  StopWatch sync_timer(db_options->clock, db_options->stats,
                       MANIFEST_FILE_SYNC_MICROS);
  IOStatus sync_status = file->Sync(db_options->use_fsync);
  return sync_status;
}
|
|
|
|
|
|
|
|
// Collects the names of info log files belonging to the DB `dbname`.
//
// *parent_dir is set to `db_log_dir` when that is non-empty, otherwise to
// `dbname`; the children of that directory are listed through `fs`, and each
// child that ParseFileName() classifies as kInfoLogFile is appended to
// *info_log_list (existing entries are left in place).
//
// `parent_dir` and `info_log_list` must be non-null.
// Returns the GetChildren() failure if listing fails, otherwise OK.
Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
                       const std::string& db_log_dir, const std::string& dbname,
                       std::string* parent_dir,
                       std::vector<std::string>* info_log_list) {
  assert(parent_dir != nullptr);
  assert(info_log_list != nullptr);

  const bool has_log_dir = !db_log_dir.empty();
  *parent_dir = has_log_dir ? db_log_dir : dbname;

  std::vector<std::string> children;
  Status s = fs->GetChildren(*parent_dir, IOOptions(), &children, nullptr);
  if (!s.ok()) {
    return s;
  }

  InfoLogPrefix info_log_prefix(has_log_dir, dbname);

  // Scratch outputs for ParseFileName(); only the parsed type is consulted.
  uint64_t file_number = 0;
  FileType file_type = kWalFile;
  for (auto& child : children) {
    const bool parsed = ParseFileName(child, &file_number,
                                      info_log_prefix.prefix, &file_type);
    if (parsed && file_type == kInfoLogFile) {
      info_log_list->push_back(child);
    }
  }
  return Status::OK();
}
|
|
|
|
|
|
|
|
std::string NormalizePath(const std::string& path) {
|
|
|
|
std::string dst;
|
|
|
|
|
|
|
|
if (path.length() > 2 && path[0] == kFilePathSeparator &&
|
|
|
|
path[1] == kFilePathSeparator) { // Handle UNC names
|
|
|
|
dst.append(2, kFilePathSeparator);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto c : path) {
|
Fix MSVC-related build issues (#7439)
Summary:
This PR addresses some build and functional issues on MSVC targets, as a step towards an eventual goal of having RocksDB build successfully for Windows on ARM64.
Addressed issues include:
- BitsSetToOne and CountTrailingZeroBits do not compile on non-x64 MSVC targets. A fallback implementation of BitsSetToOne when Intel intrinsics are not available is added, based on the C++20 `<bit>` popcount implementation in Microsoft's STL.
- The implementation of FloorLog2 for MSVC targets (including x64) gives incorrect results. The unit test easily detects this, but CircleCI is currently configured to only run a specific set of tests for Windows CMake builds, so this seems to have been unnoticed.
- AsmVolatilePause does not use YieldProcessor on Windows ARM64 targets, even though it is available.
- When CondVar::TimedWait calls Microsoft STL's condition_variable::wait_for, it can potentially trigger a bug (just recently fixed in the upcoming VS 16.8's STL) that deadlocks various tests that wait for a timer to execute, since `Timer::Run` doesn't get a chance to execute before being blocked by the test function acquiring the mutex.
- In c_test, `GetTempDir` assumes a POSIX-style temp path.
- `NormalizePath` did not eliminate consecutive POSIX-style path separators on Windows, resulting in test failures in e.g., wal_manager_test.
- Various other test failures.
In a followup PR I hope to modify CircleCI's config.yml to invoke all RocksDB unit tests in Windows CMake builds with CTest, instead of the current use of `run_ci_db_test.ps1` which requires individual tests to be specified and is missing many of the existing tests.
Notes from peterd: FloorLog2 is not yet used in production code (it's for something in progress). I also added a few more inexpensive platform-dependent tests to Windows CircleCI runs. And included facebook/folly#1461 as requested
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7439
Reviewed By: jay-zhuang
Differential Revision: D24021563
Pulled By: pdillinger
fbshipit-source-id: 0ec2027c0d6a494d8a0fe38d9667fc2f7e29f7e7
4 years ago
|
|
|
if (!dst.empty() && (c == kFilePathSeparator || c == '/') &&
|
|
|
|
(dst.back() == kFilePathSeparator || dst.back() == '/')) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
dst.push_back(c);
|
|
|
|
}
|
|
|
|
return dst;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|