|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/version_edit.h"
|
|
|
|
|
|
|
|
#include "db/blob/blob_index.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "logging/event_logger.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "table/unique_id_impl.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
namespace {} // anonymous namespace
|
|
|
|
|
|
|
|
uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
|
|
|
|
assert(number <= kFileNumberMask);
|
|
|
|
return number | (path_id * (kFileNumberMask + 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
|
|
|
|
SequenceNumber seqno,
|
|
|
|
ValueType value_type) {
|
|
|
|
if (value_type == kTypeBlobIndex) {
|
|
|
|
BlobIndex blob_index;
|
|
|
|
const Status s = blob_index.DecodeFrom(value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!blob_index.IsInlined() && !blob_index.HasTTL()) {
|
|
|
|
if (blob_index.file_number() == kInvalidBlobFileNumber) {
|
|
|
|
return Status::Corruption("Invalid blob file number");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (oldest_blob_file_number == kInvalidBlobFileNumber ||
|
|
|
|
oldest_blob_file_number > blob_index.file_number()) {
|
|
|
|
oldest_blob_file_number = blob_index.file_number();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (smallest.size() == 0) {
|
|
|
|
smallest.DecodeFrom(key);
|
|
|
|
}
|
|
|
|
largest.DecodeFrom(key);
|
|
|
|
fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
|
|
|
|
fd.largest_seqno = std::max(fd.largest_seqno, seqno);
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void VersionEdit::Clear() {
|
|
|
|
max_level_ = 0;
|
|
|
|
db_id_.clear();
|
|
|
|
comparator_.clear();
|
|
|
|
log_number_ = 0;
|
|
|
|
prev_log_number_ = 0;
|
|
|
|
next_file_number_ = 0;
|
|
|
|
max_column_family_ = 0;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
min_log_number_to_keep_ = 0;
|
|
|
|
last_sequence_ = 0;
|
|
|
|
has_db_id_ = false;
|
|
|
|
has_comparator_ = false;
|
|
|
|
has_log_number_ = false;
|
|
|
|
has_prev_log_number_ = false;
|
|
|
|
has_next_file_number_ = false;
|
|
|
|
has_max_column_family_ = false;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
has_min_log_number_to_keep_ = false;
|
|
|
|
has_last_sequence_ = false;
|
|
|
|
compact_cursors_.clear();
|
|
|
|
deleted_files_.clear();
|
|
|
|
new_files_.clear();
|
|
|
|
blob_file_additions_.clear();
|
|
|
|
blob_file_garbages_.clear();
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
wal_additions_.clear();
|
|
|
|
wal_deletion_.Reset();
|
|
|
|
column_family_ = 0;
|
|
|
|
is_column_family_add_ = false;
|
|
|
|
is_column_family_drop_ = false;
|
|
|
|
column_family_name_.clear();
|
|
|
|
is_in_atomic_group_ = false;
|
|
|
|
remaining_entries_ = 0;
|
|
|
|
full_history_ts_low_.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool VersionEdit::EncodeTo(std::string* dst,
|
|
|
|
std::optional<size_t> ts_sz) const {
|
|
|
|
if (has_db_id_) {
|
|
|
|
PutVarint32(dst, kDbId);
|
|
|
|
PutLengthPrefixedSlice(dst, db_id_);
|
|
|
|
}
|
|
|
|
if (has_comparator_) {
|
|
|
|
PutVarint32(dst, kComparator);
|
|
|
|
PutLengthPrefixedSlice(dst, comparator_);
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
PutVarint32Varint64(dst, kLogNumber, log_number_);
|
|
|
|
}
|
|
|
|
if (has_prev_log_number_) {
|
|
|
|
PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
|
|
|
|
}
|
|
|
|
if (has_next_file_number_) {
|
|
|
|
PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
|
|
|
|
}
|
|
|
|
if (has_max_column_family_) {
|
|
|
|
PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
|
|
|
|
}
|
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_);
|
|
|
|
}
|
|
|
|
if (has_last_sequence_) {
|
|
|
|
PutVarint32Varint64(dst, kLastSequence, last_sequence_);
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < compact_cursors_.size(); i++) {
|
|
|
|
if (compact_cursors_[i].second.Valid()) {
|
|
|
|
PutVarint32(dst, kCompactCursor);
|
|
|
|
PutVarint32(dst, compact_cursors_[i].first); // level
|
|
|
|
PutLengthPrefixedSlice(dst, compact_cursors_[i].second.Encode());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (const auto& deleted : deleted_files_) {
|
|
|
|
PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
|
|
|
|
deleted.second /* file number */);
|
|
|
|
}
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
bool min_log_num_written = false;
|
|
|
|
|
|
|
|
assert(new_files_.empty() || ts_sz.has_value());
|
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
|
|
|
const FileMetaData& f = new_files_[i].second;
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
if (!f.smallest.Valid() || !f.largest.Valid() ||
|
|
|
|
f.epoch_number == kUnknownEpochNumber) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
PutVarint32(dst, kNewFile4);
|
|
|
|
PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
|
|
|
|
PutVarint64(dst, f.fd.GetFileSize());
|
|
|
|
EncodeFileBoundaries(dst, f, ts_sz.value());
|
|
|
|
PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
|
|
|
|
// Customized fields' format:
|
|
|
|
// +-----------------------------+
|
|
|
|
// | 1st field's tag (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | 1st field's size (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | bytes for 1st field |
|
|
|
|
// | (based on size decoded) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | |
|
|
|
|
// | ...... |
|
|
|
|
// | |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | last field's size (varint32)|
|
|
|
|
// +-----------------------------+
|
|
|
|
// | bytes for last field |
|
|
|
|
// | (based on size decoded) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | terminating tag (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
//
|
|
|
|
// Customized encoding for fields:
|
|
|
|
// tag kPathId: 1 byte as path_id
|
|
|
|
// tag kNeedCompaction:
|
|
|
|
// now only can take one char value 1 indicating need-compaction
|
|
|
|
//
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
|
|
|
|
std::string varint_oldest_ancester_time;
|
|
|
|
PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
|
|
|
|
&varint_oldest_ancester_time);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
|
|
|
|
std::string varint_file_creation_time;
|
|
|
|
PutVarint64(&varint_file_creation_time, f.file_creation_time);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
|
|
|
|
&varint_file_creation_time);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
PutVarint32(dst, NewFileCustomTag::kEpochNumber);
|
|
|
|
std::string varint_epoch_number;
|
|
|
|
PutVarint64(&varint_epoch_number, f.epoch_number);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_epoch_number));
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileChecksum);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
|
|
|
|
|
|
|
|
if (f.fd.GetPathId() != 0) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kPathId);
|
|
|
|
char p = static_cast<char>(f.fd.GetPathId());
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kTemperature);
|
|
|
|
char p = static_cast<char>(f.temperature);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
|
|
|
if (f.marked_for_compaction) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
|
|
|
|
char p = static_cast<char>(1);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
|
|
|
if (has_min_log_number_to_keep_ && !min_log_num_written) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
|
|
|
|
std::string varint_log_number;
|
|
|
|
PutFixed64(&varint_log_number, min_log_number_to_keep_);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_log_number));
|
|
|
|
min_log_num_written = true;
|
|
|
|
}
|
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
|
|
|
|
std::string oldest_blob_file_number;
|
|
|
|
PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
|
|
|
|
}
|
|
|
|
UniqueId64x2 unique_id = f.unique_id;
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
|
|
|
|
if (unique_id != kNullUniqueId64x2) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kUniqueId);
|
|
|
|
std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(unique_id_str));
|
|
|
|
}
|
|
|
|
if (f.compensated_range_deletion_size) {
|
|
|
|
PutVarint32(dst, kCompensatedRangeDeletionSize);
|
|
|
|
std::string compensated_range_deletion_size;
|
|
|
|
PutVarint64(&compensated_range_deletion_size,
|
|
|
|
f.compensated_range_deletion_size);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
|
|
|
|
}
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2 years ago
|
|
|
if (f.tail_size) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kTailSize);
|
|
|
|
std::string varint_tail_size;
|
|
|
|
PutVarint64(&varint_tail_size, f.tail_size);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
|
|
|
|
}
|
|
|
|
if (!f.user_defined_timestamps_persisted) {
|
|
|
|
// The default value for the flag is true, it's only explicitly persisted
|
|
|
|
// when it's false. We are putting 0 as the value here to signal false
|
|
|
|
// (i.e. UDTS not persisted).
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted);
|
|
|
|
char p = static_cast<char>(0);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
|
|
|
|
dst);
|
|
|
|
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kTerminate);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
PutVarint32(dst, kBlobFileAddition);
|
|
|
|
blob_file_addition.EncodeTo(dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
PutVarint32(dst, kBlobFileGarbage);
|
|
|
|
blob_file_garbage.EncodeTo(dst);
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
4 years ago
|
|
|
PutVarint32(dst, kWalAddition2);
|
|
|
|
std::string encoded;
|
|
|
|
wal_addition.EncodeTo(&encoded);
|
|
|
|
PutLengthPrefixedSlice(dst, encoded);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
4 years ago
|
|
|
PutVarint32(dst, kWalDeletion2);
|
|
|
|
std::string encoded;
|
|
|
|
wal_deletion_.EncodeTo(&encoded);
|
|
|
|
PutLengthPrefixedSlice(dst, encoded);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// 0 is default and does not need to be explicitly written
|
|
|
|
if (column_family_ != 0) {
|
|
|
|
PutVarint32Varint32(dst, kColumnFamily, column_family_);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
PutVarint32(dst, kColumnFamilyAdd);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
PutVarint32(dst, kColumnFamilyDrop);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
PutVarint32(dst, kInAtomicGroup);
|
|
|
|
PutVarint32(dst, remaining_entries_);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (HasFullHistoryTsLow()) {
|
|
|
|
PutVarint32(dst, kFullHistoryTsLow);
|
|
|
|
PutLengthPrefixedSlice(dst, full_history_ts_low_);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool GetInternalKey(Slice* input, InternalKey* dst) {
|
|
|
|
Slice str;
|
|
|
|
if (GetLengthPrefixedSlice(input, &str)) {
|
|
|
|
dst->DecodeFrom(str);
|
[fix] SIGSEGV when VersionEdit in MANIFEST is corrupted
Summary:
This was reported by our customers in task #4295529.
Cause:
* MANIFEST file contains a VersionEdit, which contains file entries whose 'smallest' and 'largest' internal keys are empty. String with zero characters. Root cause of corruption was not investigated. We should report corruption when this happens. However, we currently SIGSEGV.
Here's what happens:
* VersionEdit encodes zero-strings happily and stores them in smallest and largest InternalKeys. InternalKey::Encode() does assert when `rep_.empty()`, but we don't assert in production environemnts. Also, we should never assert as a result of DB corruption.
* As part of our ConsistencyCheck, we call GetLiveFilesMetaData()
* GetLiveFilesMetadata() calls `file->largest.user_key().ToString()`
* user_key() function does: 1. assert(size > 8) (ooops, no assert), 2. returns `Slice(internal_key.data(), internal_key.size() - 8)`
* since `internal_key.size()` is unsigned int, this call translates to `Slice(whatever, 1298471928561892576182756)`. Bazinga.
Fix:
* VersionEdit checks if InternalKey is valid in `VersionEdit::GetInternalKey()`. If it's invalid, returns corruption.
Lessons learned:
* Always keep in mind that even if you `assert()`, production code will continue execution even if assert fails.
* Never `assert` based on DB corruption. Assert only if the code should guarantee that assert can't fail.
Test Plan: dumped offending manifest. Before: assert. Now: corruption
Reviewers: dhruba, haobo, sdong
Reviewed By: dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18507
11 years ago
|
|
|
return dst->Valid();
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
|
|
|
|
uint32_t v = 0;
|
|
|
|
if (GetVarint32(input, &v)) {
|
|
|
|
*level = v;
|
|
|
|
if (max_level_ < *level) {
|
|
|
|
max_level_ = *level;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* VersionEdit::DecodeNewFile4From(Slice* input) {
|
|
|
|
const char* msg = nullptr;
|
|
|
|
int level = 0;
|
|
|
|
FileMetaData f;
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint32_t path_id = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
|
|
|
if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
|
|
|
|
GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
|
|
|
|
GetInternalKey(input, &f.largest) &&
|
|
|
|
GetVarint64(input, &smallest_seqno) &&
|
|
|
|
GetVarint64(input, &largest_seqno)) {
|
|
|
|
// See comments in VersionEdit::EncodeTo() for format of customized fields
|
|
|
|
while (true) {
|
|
|
|
uint32_t custom_tag = 0;
|
|
|
|
Slice field;
|
|
|
|
if (!GetVarint32(input, &custom_tag)) {
|
|
|
|
return "new-file4 custom field";
|
|
|
|
}
|
|
|
|
if (custom_tag == kTerminate) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!GetLengthPrefixedSlice(input, &field)) {
|
|
|
|
return "new-file4 custom field length prefixed slice error";
|
|
|
|
}
|
|
|
|
switch (custom_tag) {
|
|
|
|
case kPathId:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "path_id field wrong size";
|
|
|
|
}
|
|
|
|
path_id = field[0];
|
|
|
|
if (path_id > 3) {
|
|
|
|
return "path_id wrong vaue";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kOldestAncesterTime:
|
|
|
|
if (!GetVarint64(&field, &f.oldest_ancester_time)) {
|
|
|
|
return "invalid oldest ancester time";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kFileCreationTime:
|
|
|
|
if (!GetVarint64(&field, &f.file_creation_time)) {
|
|
|
|
return "invalid file creation time";
|
|
|
|
}
|
|
|
|
break;
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
case kEpochNumber:
|
|
|
|
if (!GetVarint64(&field, &f.epoch_number)) {
|
|
|
|
return "invalid epoch number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kFileChecksum:
|
|
|
|
f.file_checksum = field.ToString();
|
|
|
|
break;
|
|
|
|
case kFileChecksumFuncName:
|
|
|
|
f.file_checksum_func_name = field.ToString();
|
|
|
|
break;
|
|
|
|
case kNeedCompaction:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "need_compaction field wrong size";
|
|
|
|
}
|
|
|
|
f.marked_for_compaction = (field[0] == 1);
|
|
|
|
break;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
case kMinLogNumberToKeepHack:
|
|
|
|
// This is a hack to encode kMinLogNumberToKeep in a
|
|
|
|
// forward-compatible fashion.
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
if (!GetFixed64(&field, &min_log_number_to_keep_)) {
|
|
|
|
return "deleted log number malformatted";
|
|
|
|
}
|
|
|
|
has_min_log_number_to_keep_ = true;
|
|
|
|
break;
|
|
|
|
case kOldestBlobFileNumber:
|
|
|
|
if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
|
|
|
|
return "invalid oldest blob file number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kTemperature:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "temperature field wrong size";
|
|
|
|
} else {
|
|
|
|
Temperature casted_field = static_cast<Temperature>(field[0]);
|
|
|
|
if (casted_field <= Temperature::kCold) {
|
|
|
|
f.temperature = casted_field;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kUniqueId:
|
|
|
|
if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) {
|
|
|
|
f.unique_id = kNullUniqueId64x2;
|
|
|
|
return "invalid unique id";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kCompensatedRangeDeletionSize:
|
|
|
|
if (!GetVarint64(&field, &f.compensated_range_deletion_size)) {
|
|
|
|
return "Invalid compensated range deletion size";
|
|
|
|
}
|
|
|
|
break;
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2 years ago
|
|
|
case kTailSize:
|
|
|
|
if (!GetVarint64(&field, &f.tail_size)) {
|
|
|
|
return "invalid tail start offset";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kUserDefinedTimestampsPersisted:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "user-defined timestamps persisted field wrong size";
|
|
|
|
}
|
|
|
|
f.user_defined_timestamps_persisted = (field[0] == 1);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
|
|
|
|
// Should not proceed if cannot understand it
|
|
|
|
return "new-file4 custom field not supported";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return "new-file4 entry";
|
|
|
|
}
|
|
|
|
f.fd =
|
|
|
|
FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
void VersionEdit::EncodeFileBoundaries(std::string* dst,
|
|
|
|
const FileMetaData& meta,
|
|
|
|
size_t ts_sz) const {
|
|
|
|
if (ts_sz == 0 || meta.user_defined_timestamps_persisted) {
|
|
|
|
PutLengthPrefixedSlice(dst, meta.smallest.Encode());
|
|
|
|
PutLengthPrefixedSlice(dst, meta.largest.Encode());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
std::string smallest_buf;
|
|
|
|
std::string largest_buf;
|
|
|
|
StripTimestampFromInternalKey(&smallest_buf, meta.smallest.Encode(), ts_sz);
|
|
|
|
StripTimestampFromInternalKey(&largest_buf, meta.largest.Encode(), ts_sz);
|
|
|
|
PutLengthPrefixedSlice(dst, smallest_buf);
|
|
|
|
PutLengthPrefixedSlice(dst, largest_buf);
|
|
|
|
return;
|
|
|
|
};
|
|
|
|
|
|
|
|
Status VersionEdit::DecodeFrom(const Slice& src) {
|
|
|
|
Clear();
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
4 years ago
|
|
|
#ifndef NDEBUG
|
|
|
|
bool ignore_ignorable_tags = false;
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags",
|
|
|
|
&ignore_ignorable_tags);
|
|
|
|
#endif
|
|
|
|
Slice input = src;
|
|
|
|
const char* msg = nullptr;
|
|
|
|
uint32_t tag = 0;
|
|
|
|
|
|
|
|
// Temporary storage for parsing
|
|
|
|
int level = 0;
|
|
|
|
FileMetaData f;
|
|
|
|
Slice str;
|
|
|
|
InternalKey key;
|
|
|
|
while (msg == nullptr && GetVarint32(&input, &tag)) {
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
4 years ago
|
|
|
#ifndef NDEBUG
|
|
|
|
if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) {
|
|
|
|
tag = kTagSafeIgnoreMask;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
switch (tag) {
|
|
|
|
case kDbId:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
db_id_ = str.ToString();
|
|
|
|
has_db_id_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "db id";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kComparator:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
comparator_ = str.ToString();
|
|
|
|
has_comparator_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "comparator name";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kLogNumber:
|
|
|
|
if (GetVarint64(&input, &log_number_)) {
|
|
|
|
has_log_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "log number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kPrevLogNumber:
|
|
|
|
if (GetVarint64(&input, &prev_log_number_)) {
|
|
|
|
has_prev_log_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "previous log number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kNextFileNumber:
|
|
|
|
if (GetVarint64(&input, &next_file_number_)) {
|
|
|
|
has_next_file_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "next file number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kMaxColumnFamily:
|
|
|
|
if (GetVarint32(&input, &max_column_family_)) {
|
|
|
|
has_max_column_family_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "max column family";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
case kMinLogNumberToKeep:
|
|
|
|
if (GetVarint64(&input, &min_log_number_to_keep_)) {
|
|
|
|
has_min_log_number_to_keep_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "min log number to kee";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kLastSequence:
|
|
|
|
if (GetVarint64(&input, &last_sequence_)) {
|
|
|
|
has_last_sequence_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "last sequence number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kCompactCursor:
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
|
|
|
|
// Here we re-use the output format of compact pointer in LevelDB
|
|
|
|
// to persist compact_cursors_
|
|
|
|
compact_cursors_.push_back(std::make_pair(level, key));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "compaction cursor";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kDeletedFile: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
|
|
|
|
deleted_files_.insert(std::make_pair(level, number));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "deleted file";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kNewFile: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint64(&input, &file_size) &&
|
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest)) {
|
|
|
|
f.fd = FileDescriptor(number, 0, file_size);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file entry";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case kNewFile2: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint64(&input, &file_size) &&
|
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest) &&
|
|
|
|
GetVarint64(&input, &smallest_seqno) &&
|
|
|
|
GetVarint64(&input, &largest_seqno)) {
|
|
|
|
f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
|
|
|
|
largest_seqno);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file2 entry";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kNewFile3: {
|
|
|
|
uint64_t number = 0;
|
|
|
|
uint32_t path_id = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
|
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest) &&
|
|
|
|
GetVarint64(&input, &smallest_seqno) &&
|
|
|
|
GetVarint64(&input, &largest_seqno)) {
|
|
|
|
f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
|
|
|
|
largest_seqno);
|
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file3 entry";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kNewFile4: {
|
|
|
|
msg = DecodeNewFile4From(&input);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kBlobFileAddition:
|
|
|
|
case kBlobFileAddition_DEPRECATED: {
|
|
|
|
BlobFileAddition blob_file_addition;
|
|
|
|
const Status s = blob_file_addition.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
AddBlobFile(std::move(blob_file_addition));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kBlobFileGarbage:
|
|
|
|
case kBlobFileGarbage_DEPRECATED: {
|
|
|
|
BlobFileGarbage blob_file_garbage;
|
|
|
|
const Status s = blob_file_garbage.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
AddBlobFileGarbage(std::move(blob_file_garbage));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
case kWalAddition: {
|
|
|
|
WalAddition wal_addition;
|
|
|
|
const Status s = wal_addition.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_additions_.emplace_back(std::move(wal_addition));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
4 years ago
|
|
|
case kWalAddition2: {
|
|
|
|
Slice encoded;
|
|
|
|
if (!GetLengthPrefixedSlice(&input, &encoded)) {
|
|
|
|
msg = "WalAddition not prefixed by length";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
WalAddition wal_addition;
|
|
|
|
const Status s = wal_addition.DecodeFrom(&encoded);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_additions_.emplace_back(std::move(wal_addition));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
case kWalDeletion: {
|
|
|
|
WalDeletion wal_deletion;
|
|
|
|
const Status s = wal_deletion.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_deletion_ = std::move(wal_deletion);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
4 years ago
|
|
|
case kWalDeletion2: {
|
|
|
|
Slice encoded;
|
|
|
|
if (!GetLengthPrefixedSlice(&input, &encoded)) {
|
|
|
|
msg = "WalDeletion not prefixed by length";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
WalDeletion wal_deletion;
|
|
|
|
const Status s = wal_deletion.DecodeFrom(&encoded);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_deletion_ = std::move(wal_deletion);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kColumnFamily:
|
|
|
|
if (!GetVarint32(&input, &column_family_)) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "set column family id";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kColumnFamilyAdd:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
is_column_family_add_ = true;
|
|
|
|
column_family_name_ = str.ToString();
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "column family add";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kColumnFamilyDrop:
|
|
|
|
is_column_family_drop_ = true;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kInAtomicGroup:
|
|
|
|
is_in_atomic_group_ = true;
|
|
|
|
if (!GetVarint32(&input, &remaining_entries_)) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "remaining entries";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kFullHistoryTsLow:
|
|
|
|
if (!GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
msg = "full_history_ts_low";
|
|
|
|
} else if (str.empty()) {
|
|
|
|
msg = "full_history_ts_low: empty";
|
|
|
|
} else {
|
|
|
|
full_history_ts_low_.assign(str.data(), str.size());
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
if (tag & kTagSafeIgnoreMask) {
|
|
|
|
// Tag from future which can be safely ignored.
|
|
|
|
// The next field must be the length of the entry.
|
|
|
|
uint32_t field_len;
|
|
|
|
if (!GetVarint32(&input, &field_len) ||
|
|
|
|
static_cast<size_t>(field_len) > input.size()) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "safely ignoreable tag length error";
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
input.remove_prefix(static_cast<size_t>(field_len));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
msg = "unknown tag";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (msg == nullptr && !input.empty()) {
|
|
|
|
msg = "invalid tag";
|
|
|
|
}
|
|
|
|
|
|
|
|
Status result;
|
|
|
|
if (msg != nullptr) {
|
|
|
|
result = Status::Corruption("VersionEdit", msg);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string VersionEdit::DebugString(bool hex_key) const {
|
|
|
|
std::string r;
|
|
|
|
r.append("VersionEdit {");
|
|
|
|
if (has_db_id_) {
|
|
|
|
r.append("\n DB ID: ");
|
|
|
|
r.append(db_id_);
|
|
|
|
}
|
|
|
|
if (has_comparator_) {
|
|
|
|
r.append("\n Comparator: ");
|
|
|
|
r.append(comparator_);
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
r.append("\n LogNumber: ");
|
|
|
|
AppendNumberTo(&r, log_number_);
|
|
|
|
}
|
|
|
|
if (has_prev_log_number_) {
|
|
|
|
r.append("\n PrevLogNumber: ");
|
|
|
|
AppendNumberTo(&r, prev_log_number_);
|
|
|
|
}
|
|
|
|
if (has_next_file_number_) {
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
r.append("\n NextFileNumber: ");
|
|
|
|
AppendNumberTo(&r, next_file_number_);
|
|
|
|
}
|
|
|
|
if (has_max_column_family_) {
|
|
|
|
r.append("\n MaxColumnFamily: ");
|
|
|
|
AppendNumberTo(&r, max_column_family_);
|
|
|
|
}
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
r.append("\n MinLogNumberToKeep: ");
|
|
|
|
AppendNumberTo(&r, min_log_number_to_keep_);
|
|
|
|
}
|
|
|
|
if (has_last_sequence_) {
|
|
|
|
r.append("\n LastSeq: ");
|
|
|
|
AppendNumberTo(&r, last_sequence_);
|
|
|
|
}
|
|
|
|
for (const auto& level_and_compact_cursor : compact_cursors_) {
|
|
|
|
r.append("\n CompactCursor: ");
|
|
|
|
AppendNumberTo(&r, level_and_compact_cursor.first);
|
|
|
|
r.append(" ");
|
|
|
|
r.append(level_and_compact_cursor.second.DebugString(hex_key));
|
|
|
|
}
|
|
|
|
for (const auto& deleted_file : deleted_files_) {
|
|
|
|
r.append("\n DeleteFile: ");
|
|
|
|
AppendNumberTo(&r, deleted_file.first);
|
|
|
|
r.append(" ");
|
|
|
|
AppendNumberTo(&r, deleted_file.second);
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
|
|
|
const FileMetaData& f = new_files_[i].second;
|
|
|
|
r.append("\n AddFile: ");
|
|
|
|
AppendNumberTo(&r, new_files_[i].first);
|
|
|
|
r.append(" ");
|
|
|
|
AppendNumberTo(&r, f.fd.GetNumber());
|
|
|
|
r.append(" ");
|
|
|
|
AppendNumberTo(&r, f.fd.GetFileSize());
|
|
|
|
r.append(" ");
|
|
|
|
r.append(f.smallest.DebugString(hex_key));
|
|
|
|
r.append(" .. ");
|
|
|
|
r.append(f.largest.DebugString(hex_key));
|
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
r.append(" blob_file:");
|
|
|
|
AppendNumberTo(&r, f.oldest_blob_file_number);
|
|
|
|
}
|
|
|
|
r.append(" oldest_ancester_time:");
|
|
|
|
AppendNumberTo(&r, f.oldest_ancester_time);
|
|
|
|
r.append(" file_creation_time:");
|
|
|
|
AppendNumberTo(&r, f.file_creation_time);
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
r.append(" epoch_number:");
|
|
|
|
AppendNumberTo(&r, f.epoch_number);
|
|
|
|
r.append(" file_checksum:");
|
|
|
|
r.append(Slice(f.file_checksum).ToString(true));
|
|
|
|
r.append(" file_checksum_func_name: ");
|
|
|
|
r.append(f.file_checksum_func_name);
|
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
|
|
|
r.append(" temperature: ");
|
|
|
|
// Maybe change to human readable format whenthe feature becomes
|
|
|
|
// permanent
|
|
|
|
r.append(std::to_string(static_cast<int>(f.temperature)));
|
|
|
|
}
|
|
|
|
if (f.unique_id != kNullUniqueId64x2) {
|
|
|
|
r.append(" unique_id(internal): ");
|
|
|
|
UniqueId64x2 id = f.unique_id;
|
|
|
|
r.append(InternalUniqueIdToHumanString(&id));
|
|
|
|
r.append(" public_unique_id: ");
|
|
|
|
InternalUniqueIdToExternal(&id);
|
|
|
|
r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
|
|
|
|
}
|
|
|
|
r.append(" tail size: ");
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2 years ago
|
|
|
AppendNumberTo(&r, f.tail_size);
|
|
|
|
r.append(" User-defined timestamps persisted: ");
|
|
|
|
r.append(f.user_defined_timestamps_persisted ? "true" : "false");
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
r.append("\n BlobFileAddition: ");
|
|
|
|
r.append(blob_file_addition.DebugString());
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
r.append("\n BlobFileGarbage: ");
|
|
|
|
r.append(blob_file_garbage.DebugString());
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
|
|
|
r.append("\n WalAddition: ");
|
|
|
|
r.append(wal_addition.DebugString());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
r.append("\n WalDeletion: ");
|
|
|
|
r.append(wal_deletion_.DebugString());
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
r.append("\n ColumnFamily: ");
|
|
|
|
AppendNumberTo(&r, column_family_);
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
r.append("\n ColumnFamilyAdd: ");
|
|
|
|
r.append(column_family_name_);
|
|
|
|
}
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
r.append("\n ColumnFamilyDrop");
|
|
|
|
}
|
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
r.append("\n AtomicGroup: ");
|
|
|
|
AppendNumberTo(&r, remaining_entries_);
|
|
|
|
r.append(" entries remains");
|
|
|
|
}
|
|
|
|
if (HasFullHistoryTsLow()) {
|
|
|
|
r.append("\n FullHistoryTsLow: ");
|
|
|
|
r.append(Slice(full_history_ts_low_).ToString(hex_key));
|
|
|
|
}
|
|
|
|
r.append("\n}\n");
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
|
|
|
|
JSONWriter jw;
|
|
|
|
jw << "EditNumber" << edit_num;
|
|
|
|
|
|
|
|
if (has_db_id_) {
|
|
|
|
jw << "DB ID" << db_id_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
if (has_comparator_) {
|
|
|
|
jw << "Comparator" << comparator_;
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
jw << "LogNumber" << log_number_;
|
|
|
|
}
|
|
|
|
if (has_prev_log_number_) {
|
|
|
|
jw << "PrevLogNumber" << prev_log_number_;
|
|
|
|
}
|
|
|
|
if (has_next_file_number_) {
|
|
|
|
jw << "NextFileNumber" << next_file_number_;
|
|
|
|
}
|
|
|
|
if (has_max_column_family_) {
|
|
|
|
jw << "MaxColumnFamily" << max_column_family_;
|
|
|
|
}
|
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
if (has_last_sequence_) {
|
|
|
|
jw << "LastSeq" << last_sequence_;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!deleted_files_.empty()) {
|
|
|
|
jw << "DeletedFiles";
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& deleted_file : deleted_files_) {
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << "Level" << deleted_file.first;
|
|
|
|
jw << "FileNumber" << deleted_file.second;
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!new_files_.empty()) {
|
|
|
|
jw << "AddedFiles";
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << "Level" << new_files_[i].first;
|
|
|
|
const FileMetaData& f = new_files_[i].second;
|
|
|
|
jw << "FileNumber" << f.fd.GetNumber();
|
|
|
|
jw << "FileSize" << f.fd.GetFileSize();
|
|
|
|
jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
|
|
|
|
jw << "LargestIKey" << f.largest.DebugString(hex_key);
|
|
|
|
jw << "OldestAncesterTime" << f.oldest_ancester_time;
|
|
|
|
jw << "FileCreationTime" << f.file_creation_time;
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
jw << "EpochNumber" << f.epoch_number;
|
|
|
|
jw << "FileChecksum" << Slice(f.file_checksum).ToString(true);
|
|
|
|
jw << "FileChecksumFuncName" << f.file_checksum_func_name;
|
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
|
|
|
jw << "temperature" << std::to_string(static_cast<int>(f.temperature));
|
|
|
|
}
|
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
jw << "OldestBlobFile" << f.oldest_blob_file_number;
|
|
|
|
}
|
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
|
|
|
// Maybe change to human readable format whenthe feature becomes
|
|
|
|
// permanent
|
|
|
|
jw << "Temperature" << static_cast<int>(f.temperature);
|
|
|
|
}
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2 years ago
|
|
|
jw << "TailSize" << f.tail_size;
|
|
|
|
jw << "UserDefinedTimestampsPersisted"
|
|
|
|
<< f.user_defined_timestamps_persisted;
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!blob_file_additions_.empty()) {
|
|
|
|
jw << "BlobFileAdditions";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << blob_file_addition;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!blob_file_garbages_.empty()) {
|
|
|
|
jw << "BlobFileGarbages";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << blob_file_garbage;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
if (!wal_additions_.empty()) {
|
|
|
|
jw << "WalAdditions";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << wal_addition;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
|
|
|
jw << "WalDeletion";
|
|
|
|
jw.StartObject();
|
|
|
|
jw << wal_deletion_;
|
|
|
|
jw.EndObject();
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
4 years ago
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
jw << "ColumnFamily" << column_family_;
|
|
|
|
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
jw << "ColumnFamilyAdd" << column_family_name_;
|
|
|
|
}
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
jw << "ColumnFamilyDrop" << column_family_name_;
|
|
|
|
}
|
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
jw << "AtomicGroup" << remaining_entries_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
|
|
|
|
if (HasFullHistoryTsLow()) {
|
|
|
|
jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
|
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
10 years ago
|
|
|
jw.EndObject();
|
|
|
|
|
|
|
|
return jw.Get();
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|