|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/version_edit.h"
|
|
|
|
|
|
|
|
#include "db/blob/blob_index.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "logging/event_logger.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
|
|
|
|
assert(number <= kFileNumberMask);
|
|
|
|
return number | (path_id * (kFileNumberMask + 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
|
|
|
|
SequenceNumber seqno,
|
|
|
|
ValueType value_type) {
|
|
|
|
if (smallest.size() == 0) {
|
|
|
|
smallest.DecodeFrom(key);
|
|
|
|
}
|
|
|
|
largest.DecodeFrom(key);
|
|
|
|
fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
|
|
|
|
fd.largest_seqno = std::max(fd.largest_seqno, seqno);
|
|
|
|
|
|
|
|
if (value_type == kTypeBlobIndex) {
|
|
|
|
BlobIndex blob_index;
|
|
|
|
const Status s = blob_index.DecodeFrom(value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (blob_index.IsInlined()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (blob_index.HasTTL()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Paranoid check: this should not happen because BlobDB numbers the blob
|
|
|
|
// files starting from 1.
|
|
|
|
if (blob_index.file_number() == kInvalidBlobFileNumber) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (oldest_blob_file_number == kInvalidBlobFileNumber ||
|
|
|
|
oldest_blob_file_number > blob_index.file_number()) {
|
|
|
|
oldest_blob_file_number = blob_index.file_number();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void VersionEdit::Clear() {
|
|
|
|
max_level_ = 0;
|
|
|
|
db_id_.clear();
|
|
|
|
comparator_.clear();
|
|
|
|
log_number_ = 0;
|
|
|
|
prev_log_number_ = 0;
|
|
|
|
next_file_number_ = 0;
|
|
|
|
max_column_family_ = 0;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
min_log_number_to_keep_ = 0;
|
|
|
|
last_sequence_ = 0;
|
|
|
|
has_db_id_ = false;
|
|
|
|
has_comparator_ = false;
|
|
|
|
has_log_number_ = false;
|
|
|
|
has_prev_log_number_ = false;
|
|
|
|
has_next_file_number_ = false;
|
|
|
|
has_max_column_family_ = false;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
has_min_log_number_to_keep_ = false;
|
|
|
|
has_last_sequence_ = false;
|
|
|
|
deleted_files_.clear();
|
|
|
|
new_files_.clear();
|
|
|
|
blob_file_additions_.clear();
|
|
|
|
blob_file_garbages_.clear();
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
wal_additions_.clear();
|
|
|
|
wal_deletion_.Reset();
|
|
|
|
column_family_ = 0;
|
|
|
|
is_column_family_add_ = false;
|
|
|
|
is_column_family_drop_ = false;
|
|
|
|
column_family_name_.clear();
|
|
|
|
is_in_atomic_group_ = false;
|
|
|
|
remaining_entries_ = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool VersionEdit::EncodeTo(std::string* dst) const {
|
|
|
|
if (has_db_id_) {
|
|
|
|
PutVarint32(dst, kDbId);
|
|
|
|
PutLengthPrefixedSlice(dst, db_id_);
|
|
|
|
}
|
|
|
|
if (has_comparator_) {
|
|
|
|
PutVarint32(dst, kComparator);
|
|
|
|
PutLengthPrefixedSlice(dst, comparator_);
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
PutVarint32Varint64(dst, kLogNumber, log_number_);
|
|
|
|
}
|
|
|
|
if (has_prev_log_number_) {
|
|
|
|
PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
|
|
|
|
}
|
|
|
|
if (has_next_file_number_) {
|
|
|
|
PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
|
|
|
|
}
|
|
|
|
if (has_max_column_family_) {
|
|
|
|
PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
|
|
|
|
}
|
|
|
|
if (has_last_sequence_) {
|
|
|
|
PutVarint32Varint64(dst, kLastSequence, last_sequence_);
|
|
|
|
}
|
|
|
|
for (const auto& deleted : deleted_files_) {
|
|
|
|
PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
|
|
|
|
deleted.second /* file number */);
|
|
|
|
}
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
bool min_log_num_written = false;
|
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
|
|
|
const FileMetaData& f = new_files_[i].second;
|
|
|
|
if (!f.smallest.Valid() || !f.largest.Valid()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
PutVarint32(dst, kNewFile4);
|
|
|
|
PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
|
|
|
|
PutVarint64(dst, f.fd.GetFileSize());
|
|
|
|
PutLengthPrefixedSlice(dst, f.smallest.Encode());
|
|
|
|
PutLengthPrefixedSlice(dst, f.largest.Encode());
|
|
|
|
PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
|
|
|
|
// Customized fields' format:
|
|
|
|
// +-----------------------------+
|
|
|
|
// | 1st field's tag (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | 1st field's size (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | bytes for 1st field |
|
|
|
|
// | (based on size decoded) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | |
|
|
|
|
// | ...... |
|
|
|
|
// | |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | last field's size (varint32)|
|
|
|
|
// +-----------------------------+
|
|
|
|
// | bytes for last field |
|
|
|
|
// | (based on size decoded) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | terminating tag (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
//
|
|
|
|
// Customized encoding for fields:
|
|
|
|
// tag kPathId: 1 byte as path_id
|
|
|
|
// tag kNeedCompaction:
|
|
|
|
// now only can take one char value 1 indicating need-compaction
|
|
|
|
//
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
|
|
|
|
std::string varint_oldest_ancester_time;
|
|
|
|
PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
|
|
|
|
&varint_oldest_ancester_time);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
|
|
|
|
std::string varint_file_creation_time;
|
|
|
|
PutVarint64(&varint_file_creation_time, f.file_creation_time);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
|
|
|
|
&varint_file_creation_time);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileChecksum);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
|
|
|
|
|
|
|
|
if (f.fd.GetPathId() != 0) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kPathId);
|
|
|
|
char p = static_cast<char>(f.fd.GetPathId());
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
|
|
|
if (f.marked_for_compaction) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
|
|
|
|
char p = static_cast<char>(1);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
|
|
|
if (has_min_log_number_to_keep_ && !min_log_num_written) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
|
|
|
|
std::string varint_log_number;
|
|
|
|
PutFixed64(&varint_log_number, min_log_number_to_keep_);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_log_number));
|
|
|
|
min_log_num_written = true;
|
|
|
|
}
|
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
|
|
|
|
std::string oldest_blob_file_number;
|
|
|
|
PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
|
|
|
|
dst);
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kTerminate);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
PutVarint32(dst, kBlobFileAddition);
|
|
|
|
blob_file_addition.EncodeTo(dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
PutVarint32(dst, kBlobFileGarbage);
|
|
|
|
blob_file_garbage.EncodeTo(dst);
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
|
|
|
PutVarint32(dst, kWalAddition);
|
|
|
|
wal_addition.EncodeTo(dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
PutVarint32(dst, kWalDeletion);
|
|
|
|
wal_deletion_.EncodeTo(dst);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// 0 is default and does not need to be explicitly written
|
|
|
|
if (column_family_ != 0) {
|
|
|
|
PutVarint32Varint32(dst, kColumnFamily, column_family_);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
PutVarint32(dst, kColumnFamilyAdd);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
PutVarint32(dst, kColumnFamilyDrop);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
PutVarint32(dst, kInAtomicGroup);
|
|
|
|
PutVarint32(dst, remaining_entries_);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool GetInternalKey(Slice* input, InternalKey* dst) {
|
|
|
|
Slice str;
|
|
|
|
if (GetLengthPrefixedSlice(input, &str)) {
|
|
|
|
dst->DecodeFrom(str);
|
[fix] SIGSEGV when VersionEdit in MANIFEST is corrupted
Summary:
This was reported by our customers in task #4295529.
Cause:
* MANIFEST file contains a VersionEdit, which contains file entries whose 'smallest' and 'largest' internal keys are empty. String with zero characters. Root cause of corruption was not investigated. We should report corruption when this happens. However, we currently SIGSEGV.
Here's what happens:
* VersionEdit encodes zero-strings happily and stores them in smallest and largest InternalKeys. InternalKey::Encode() does assert when `rep_.empty()`, but we don't assert in production environemnts. Also, we should never assert as a result of DB corruption.
* As part of our ConsistencyCheck, we call GetLiveFilesMetaData()
* GetLiveFilesMetadata() calls `file->largest.user_key().ToString()`
* user_key() function does: 1. assert(size > 8) (ooops, no assert), 2. returns `Slice(internal_key.data(), internal_key.size() - 8)`
* since `internal_key.size()` is unsigned int, this call translates to `Slice(whatever, 1298471928561892576182756)`. Bazinga.
Fix:
* VersionEdit checks if InternalKey is valid in `VersionEdit::GetInternalKey()`. If it's invalid, returns corruption.
Lessons learned:
* Always keep in mind that even if you `assert()`, production code will continue execution even if assert fails.
* Never `assert` based on DB corruption. Assert only if the code should guarantee that assert can't fail.
Test Plan: dumped offending manifest. Before: assert. Now: corruption
Reviewers: dhruba, haobo, sdong
Reviewed By: dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18507
11 years ago
|
|
|
return dst->Valid();
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
|
|
|
|
uint32_t v = 0;
|
|
|
|
if (GetVarint32(input, &v)) {
|
|
|
|
*level = v;
|
|
|
|
if (max_level_ < *level) {
|
|
|
|
max_level_ = *level;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* VersionEdit::DecodeNewFile4From(Slice* input) {
|
|
|
|
const char* msg = nullptr;
|
|
|
|
int level = 0;
|
|
|
|
FileMetaData f;
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint32_t path_id = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
|
|
|
if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
|
|
|
|
GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
|
|
|
|
GetInternalKey(input, &f.largest) &&
|
|
|
|
GetVarint64(input, &smallest_seqno) &&
|
|
|
|
GetVarint64(input, &largest_seqno)) {
|
|
|
|
// See comments in VersionEdit::EncodeTo() for format of customized fields
|
|
|
|
while (true) {
|
|
|
|
uint32_t custom_tag = 0;
|
|
|
|
Slice field;
|
|
|
|
if (!GetVarint32(input, &custom_tag)) {
|
|
|
|
return "new-file4 custom field";
|
|
|
|
}
|
|
|
|
if (custom_tag == kTerminate) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!GetLengthPrefixedSlice(input, &field)) {
|
|
|
|
return "new-file4 custom field length prefixed slice error";
|
|
|
|
}
|
|
|
|
switch (custom_tag) {
|
|
|
|
case kPathId:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "path_id field wrong size";
|
|
|
|
}
|
|
|
|
path_id = field[0];
|
|
|
|
if (path_id > 3) {
|
|
|
|
return "path_id wrong vaue";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kOldestAncesterTime:
|
|
|
|
if (!GetVarint64(&field, &f.oldest_ancester_time)) {
|
|
|
|
return "invalid oldest ancester time";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kFileCreationTime:
|
|
|
|
if (!GetVarint64(&field, &f.file_creation_time)) {
|
|
|
|
return "invalid file creation time";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kFileChecksum:
|
|
|
|
f.file_checksum = field.ToString();
|
|
|
|
break;
|
|
|
|
case kFileChecksumFuncName:
|
|
|
|
f.file_checksum_func_name = field.ToString();
|
|
|
|
break;
|
|
|
|
case kNeedCompaction:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "need_compaction field wrong size";
|
|
|
|
}
|
|
|
|
f.marked_for_compaction = (field[0] == 1);
|
|
|
|
break;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
case kMinLogNumberToKeepHack:
|
|
|
|
// This is a hack to encode kMinLogNumberToKeep in a
|
|
|
|
// forward-compatible fashion.
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
if (!GetFixed64(&field, &min_log_number_to_keep_)) {
|
|
|
|
return "deleted log number malformatted";
|
|
|
|
}
|
|
|
|
has_min_log_number_to_keep_ = true;
|
|
|
|
break;
|
|
|
|
case kOldestBlobFileNumber:
|
|
|
|
if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
|
|
|
|
return "invalid oldest blob file number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
|
|
|
|
// Should not proceed if cannot understand it
|
|
|
|
return "new-file4 custom field not supported";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return "new-file4 entry";
|
|
|
|
}
|
|
|
|
f.fd =
|
|
|
|
FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status VersionEdit::DecodeFrom(const Slice& src) {
|
|
|
|
Clear();
|
|
|
|
Slice input = src;
|
|
|
|
const char* msg = nullptr;
|
|
|
|
uint32_t tag = 0;
|
|
|
|
|
|
|
|
// Temporary storage for parsing
|
|
|
|
int level = 0;
|
|
|
|
FileMetaData f;
|
|
|
|
Slice str;
|
|
|
|
InternalKey key;
|
|
|
|
while (msg == nullptr && GetVarint32(&input, &tag)) {
|
|
|
|
switch (tag) {
|
|
|
|
case kDbId:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
db_id_ = str.ToString();
|
|
|
|
has_db_id_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "db id";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kComparator:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
comparator_ = str.ToString();
|
|
|
|
has_comparator_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "comparator name";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kLogNumber:
|
|
|
|
if (GetVarint64(&input, &log_number_)) {
|
|
|
|
has_log_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "log number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kPrevLogNumber:
|
|
|
|
if (GetVarint64(&input, &prev_log_number_)) {
|
|
|
|
has_prev_log_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "previous log number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kNextFileNumber:
|
|
|
|
if (GetVarint64(&input, &next_file_number_)) {
|
|
|
|
has_next_file_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "next file number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kMaxColumnFamily:
|
|
|
|
if (GetVarint32(&input, &max_column_family_)) {
|
|
|
|
has_max_column_family_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "max column family";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
case kMinLogNumberToKeep:
|
|
|
|
if (GetVarint64(&input, &min_log_number_to_keep_)) {
|
|
|
|
has_min_log_number_to_keep_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "min log number to kee";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kLastSequence:
|
|
|
|
if (GetVarint64(&input, &last_sequence_)) {
|
|
|
|
has_last_sequence_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "last sequence number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kCompactPointer:
|
|
|
|
if (GetLevel(&input, &level, &msg) &&
|
|
|
|
GetInternalKey(&input, &key)) {
|
|
|
|
// we don't use compact pointers anymore,
|
|
|
|
// but we should not fail if they are still
|
|
|
|
// in manifest
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "compaction pointer";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kDeletedFile: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
|
|
|
|
deleted_files_.insert(std::make_pair(level, number));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "deleted file";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kNewFile: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint64(&input, &file_size) &&
|
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest)) {
|
|
|
|
f.fd = FileDescriptor(number, 0, file_size);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file entry";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case kNewFile2: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint64(&input, &file_size) &&
|
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest) &&
|
|
|
|
GetVarint64(&input, &smallest_seqno) &&
|
|
|
|
GetVarint64(&input, &largest_seqno)) {
|
|
|
|
f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
|
|
|
|
largest_seqno);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file2 entry";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kNewFile3: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint32_t path_id = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
|
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest) &&
|
|
|
|
GetVarint64(&input, &smallest_seqno) &&
|
|
|
|
GetVarint64(&input, &largest_seqno)) {
|
|
|
|
f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
|
|
|
|
largest_seqno);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file3 entry";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kNewFile4: {
|
|
|
|
msg = DecodeNewFile4From(&input);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kBlobFileAddition: {
|
|
|
|
BlobFileAddition blob_file_addition;
|
|
|
|
const Status s = blob_file_addition.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
AddBlobFile(std::move(blob_file_addition));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kBlobFileGarbage: {
|
|
|
|
BlobFileGarbage blob_file_garbage;
|
|
|
|
const Status s = blob_file_garbage.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
AddBlobFileGarbage(std::move(blob_file_garbage));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
case kWalAddition: {
|
|
|
|
WalAddition wal_addition;
|
|
|
|
const Status s = wal_addition.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_additions_.emplace_back(std::move(wal_addition));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kWalDeletion: {
|
|
|
|
WalDeletion wal_deletion;
|
|
|
|
const Status s = wal_deletion.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_deletion_ = std::move(wal_deletion);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kColumnFamily:
|
|
|
|
if (!GetVarint32(&input, &column_family_)) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "set column family id";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kColumnFamilyAdd:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
is_column_family_add_ = true;
|
|
|
|
column_family_name_ = str.ToString();
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "column family add";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kColumnFamilyDrop:
|
|
|
|
is_column_family_drop_ = true;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kInAtomicGroup:
|
|
|
|
is_in_atomic_group_ = true;
|
|
|
|
if (!GetVarint32(&input, &remaining_entries_)) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "remaining entries";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
if (tag & kTagSafeIgnoreMask) {
|
|
|
|
// Tag from future which can be safely ignored.
|
|
|
|
// The next field must be the length of the entry.
|
|
|
|
uint32_t field_len;
|
|
|
|
if (!GetVarint32(&input, &field_len) ||
|
|
|
|
static_cast<size_t>(field_len) > input.size()) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "safely ignoreable tag length error";
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
input.remove_prefix(static_cast<size_t>(field_len));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
msg = "unknown tag";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (msg == nullptr && !input.empty()) {
|
|
|
|
msg = "invalid tag";
|
|
|
|
}
|
|
|
|
|
|
|
|
Status result;
|
|
|
|
if (msg != nullptr) {
|
|
|
|
result = Status::Corruption("VersionEdit", msg);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string VersionEdit::DebugString(bool hex_key) const {
|
|
|
|
std::string r;
|
|
|
|
r.append("VersionEdit {");
|
|
|
|
if (has_db_id_) {
|
|
|
|
r.append("\n DB ID: ");
|
|
|
|
r.append(db_id_);
|
|
|
|
}
|
|
|
|
if (has_comparator_) {
|
|
|
|
r.append("\n Comparator: ");
|
|
|
|
r.append(comparator_);
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
r.append("\n LogNumber: ");
|
|
|
|
AppendNumberTo(&r, log_number_);
|
|
|
|
}
|
|
|
|
if (has_prev_log_number_) {
|
|
|
|
r.append("\n PrevLogNumber: ");
|
|
|
|
AppendNumberTo(&r, prev_log_number_);
|
|
|
|
}
|
|
|
|
if (has_next_file_number_) {
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
r.append("\n NextFileNumber: ");
|
|
|
|
AppendNumberTo(&r, next_file_number_);
|
|
|
|
}
|
|
|
|
if (has_max_column_family_) {
|
|
|
|
r.append("\n MaxColumnFamily: ");
|
|
|
|
AppendNumberTo(&r, max_column_family_);
|
|
|
|
}
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
r.append("\n MinLogNumberToKeep: ");
|
|
|
|
AppendNumberTo(&r, min_log_number_to_keep_);
|
|
|
|
}
|
|
|
|
if (has_last_sequence_) {
|
|
|
|
r.append("\n LastSeq: ");
|
|
|
|
AppendNumberTo(&r, last_sequence_);
|
|
|
|
}
|
|
|
|
for (const auto& deleted_file : deleted_files_) {
|
|
|
|
r.append("\n DeleteFile: ");
|
|
|
|
AppendNumberTo(&r, deleted_file.first);
|
|
|
|
r.append(" ");
|
|
|
|
AppendNumberTo(&r, deleted_file.second);
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
|
|
|
const FileMetaData& f = new_files_[i].second;
|
|
|
|
r.append("\n AddFile: ");
|
|
|
|
AppendNumberTo(&r, new_files_[i].first);
|
|
|
|
r.append(" ");
|
|
|
|
AppendNumberTo(&r, f.fd.GetNumber());
|
|
|
|
r.append(" ");
|
|
|
|
AppendNumberTo(&r, f.fd.GetFileSize());
|
|
|
|
r.append(" ");
|
|
|
|
r.append(f.smallest.DebugString(hex_key));
|
|
|
|
r.append(" .. ");
|
|
|
|
r.append(f.largest.DebugString(hex_key));
|
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
r.append(" blob_file:");
|
|
|
|
AppendNumberTo(&r, f.oldest_blob_file_number);
|
|
|
|
}
|
|
|
|
r.append(" oldest_ancester_time:");
|
|
|
|
AppendNumberTo(&r, f.oldest_ancester_time);
|
|
|
|
r.append(" file_creation_time:");
|
|
|
|
AppendNumberTo(&r, f.file_creation_time);
|
|
|
|
r.append(" file_checksum:");
|
|
|
|
r.append(f.file_checksum);
|
|
|
|
r.append(" file_checksum_func_name: ");
|
|
|
|
r.append(f.file_checksum_func_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
r.append("\n BlobFileAddition: ");
|
|
|
|
r.append(blob_file_addition.DebugString());
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
r.append("\n BlobFileGarbage: ");
|
|
|
|
r.append(blob_file_garbage.DebugString());
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
|
|
|
r.append("\n WalAddition: ");
|
|
|
|
r.append(wal_addition.DebugString());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
r.append("\n WalDeletion: ");
|
|
|
|
r.append(wal_deletion_.DebugString());
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
r.append("\n ColumnFamily: ");
|
|
|
|
AppendNumberTo(&r, column_family_);
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
r.append("\n ColumnFamilyAdd: ");
|
|
|
|
r.append(column_family_name_);
|
|
|
|
}
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
r.append("\n ColumnFamilyDrop");
|
|
|
|
}
|
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
r.append("\n AtomicGroup: ");
|
|
|
|
AppendNumberTo(&r, remaining_entries_);
|
|
|
|
r.append(" entries remains");
|
|
|
|
}
|
|
|
|
r.append("\n}\n");
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
|
|
|
|
JSONWriter jw;
|
|
|
|
jw << "EditNumber" << edit_num;
|
|
|
|
|
|
|
|
if (has_db_id_) {
|
|
|
|
jw << "DB ID" << db_id_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
if (has_comparator_) {
|
|
|
|
jw << "Comparator" << comparator_;
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
jw << "LogNumber" << log_number_;
|
|
|
|
}
|
|
|
|
if (has_prev_log_number_) {
|
|
|
|
jw << "PrevLogNumber" << prev_log_number_;
|
|
|
|
}
|
|
|
|
if (has_next_file_number_) {
|
|
|
|
jw << "NextFileNumber" << next_file_number_;
|
|
|
|
}
|
|
|
|
if (has_max_column_family_) {
|
|
|
|
jw << "MaxColumnFamily" << max_column_family_;
|
|
|
|
}
|
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
if (has_last_sequence_) {
|
|
|
|
jw << "LastSeq" << last_sequence_;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!deleted_files_.empty()) {
|
|
|
|
jw << "DeletedFiles";
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& deleted_file : deleted_files_) {
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << "Level" << deleted_file.first;
|
|
|
|
jw << "FileNumber" << deleted_file.second;
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!new_files_.empty()) {
|
|
|
|
jw << "AddedFiles";
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << "Level" << new_files_[i].first;
|
|
|
|
const FileMetaData& f = new_files_[i].second;
|
|
|
|
jw << "FileNumber" << f.fd.GetNumber();
|
|
|
|
jw << "FileSize" << f.fd.GetFileSize();
|
|
|
|
jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
|
|
|
|
jw << "LargestIKey" << f.largest.DebugString(hex_key);
|
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
jw << "OldestBlobFile" << f.oldest_blob_file_number;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!blob_file_additions_.empty()) {
|
|
|
|
jw << "BlobFileAdditions";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << blob_file_addition;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!blob_file_garbages_.empty()) {
|
|
|
|
jw << "BlobFileGarbages";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << blob_file_garbage;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
if (!wal_additions_.empty()) {
|
|
|
|
jw << "WalAdditions";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << wal_addition;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
|
|
|
jw << "WalDeletion";
|
|
|
|
jw.StartObject();
|
|
|
|
jw << wal_deletion_;
|
|
|
|
jw.EndObject();
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
jw << "ColumnFamily" << column_family_;
|
|
|
|
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
jw << "ColumnFamilyAdd" << column_family_name_;
|
|
|
|
}
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
jw << "ColumnFamilyDrop" << column_family_name_;
|
|
|
|
}
|
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
jw << "AtomicGroup" << remaining_entries_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
9 years ago
|
|
|
|
|
|
|
jw.EndObject();
|
|
|
|
|
|
|
|
return jw.Get();
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|