|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
#include <tuple>
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
#include "compaction/compaction_picker_universal.h"
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
#include "db/blob/blob_index.h"
|
|
|
|
#include "db/db_test_util.h"
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "env/mock_env.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "port/stack_trace.h"
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
#include "rocksdb/concurrent_task_limiter.h"
|
|
|
|
#include "rocksdb/experimental.h"
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
#include "rocksdb/sst_file_writer.h"
|
|
|
|
#include "rocksdb/utilities/convenience.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "test_util/testutil.h"
|
|
|
|
#include "util/concurrent_task_limiter_impl.h"
|
|
|
|
#include "util/random.h"
|
|
|
|
#include "utilities/fault_injection_env.h"
|
|
|
|
#include "utilities/fault_injection_fs.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
// SYNC_POINT is not supported in released Windows mode.
|
|
|
|
#if !defined(ROCKSDB_LITE)
|
|
|
|
|
|
|
|
class CompactionStatsCollector : public EventListener {
|
|
|
|
public:
|
|
|
|
CompactionStatsCollector()
|
|
|
|
: compaction_completed_(
|
|
|
|
static_cast<int>(CompactionReason::kNumOfReasons)) {
|
|
|
|
for (auto& v : compaction_completed_) {
|
|
|
|
v.store(0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
~CompactionStatsCollector() override {}
|
|
|
|
|
|
|
|
void OnCompactionCompleted(DB* /* db */,
|
|
|
|
const CompactionJobInfo& info) override {
|
|
|
|
int k = static_cast<int>(info.compaction_reason);
|
|
|
|
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
|
|
|
assert(k >= 0 && k < num_of_reasons);
|
|
|
|
compaction_completed_[k]++;
|
|
|
|
}
|
|
|
|
|
|
|
|
void OnExternalFileIngested(
|
|
|
|
DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
|
|
|
|
int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
|
|
|
|
compaction_completed_[k]++;
|
|
|
|
}
|
|
|
|
|
|
|
|
void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
|
|
|
|
int k = static_cast<int>(CompactionReason::kFlush);
|
|
|
|
compaction_completed_[k]++;
|
|
|
|
}
|
|
|
|
|
|
|
|
int NumberOfCompactions(CompactionReason reason) const {
|
|
|
|
int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
|
|
|
|
int k = static_cast<int>(reason);
|
|
|
|
assert(k >= 0 && k < num_of_reasons);
|
|
|
|
return compaction_completed_.at(k).load();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::vector<std::atomic<int>> compaction_completed_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class DBCompactionTest : public DBTestBase {
|
|
|
|
public:
|
|
|
|
DBCompactionTest()
|
|
|
|
: DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
/*
|
|
|
|
* Verifies compaction stats of cfd are valid.
|
|
|
|
*
|
|
|
|
* For each level of cfd, its compaction stats are valid if
|
|
|
|
* 1) sum(stat.counts) == stat.count, and
|
|
|
|
* 2) stat.counts[i] == collector.NumberOfCompactions(i)
|
|
|
|
*/
|
|
|
|
void VerifyCompactionStats(ColumnFamilyData& cfd,
|
|
|
|
const CompactionStatsCollector& collector) {
|
|
|
|
#ifndef NDEBUG
|
|
|
|
InternalStats* internal_stats_ptr = cfd.internal_stats();
|
|
|
|
ASSERT_NE(internal_stats_ptr, nullptr);
|
|
|
|
const std::vector<InternalStats::CompactionStats>& comp_stats =
|
|
|
|
internal_stats_ptr->TEST_GetCompactionStats();
|
|
|
|
const int num_of_reasons =
|
|
|
|
static_cast<int>(CompactionReason::kNumOfReasons);
|
|
|
|
std::vector<int> counts(num_of_reasons, 0);
|
|
|
|
// Count the number of compactions caused by each CompactionReason across
|
|
|
|
// all levels.
|
|
|
|
for (const auto& stat : comp_stats) {
|
|
|
|
int sum = 0;
|
|
|
|
for (int i = 0; i < num_of_reasons; i++) {
|
|
|
|
counts[i] += stat.counts[i];
|
|
|
|
sum += stat.counts[i];
|
|
|
|
}
|
|
|
|
ASSERT_EQ(sum, stat.count);
|
|
|
|
}
|
|
|
|
// Verify InternalStats bookkeeping matches that of
|
|
|
|
// CompactionStatsCollector, assuming that all compactions complete.
|
|
|
|
for (int i = 0; i < num_of_reasons; i++) {
|
|
|
|
ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
|
|
|
|
counts[i]);
|
|
|
|
}
|
|
|
|
#endif /* NDEBUG */
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
class DBCompactionTestWithParam
|
|
|
|
: public DBTestBase,
|
|
|
|
public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
|
|
|
|
public:
|
|
|
|
DBCompactionTestWithParam()
|
|
|
|
: DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
max_subcompactions_ = std::get<0>(GetParam());
|
|
|
|
exclusive_manual_compaction_ = std::get<1>(GetParam());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Required if inheriting from testing::WithParamInterface<>
|
|
|
|
static void SetUpTestCase() {}
|
|
|
|
static void TearDownTestCase() {}
|
|
|
|
|
|
|
|
uint32_t max_subcompactions_;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
bool exclusive_manual_compaction_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class DBCompactionTestWithBottommostParam
|
|
|
|
: public DBTestBase,
|
|
|
|
public testing::WithParamInterface<BottommostLevelCompaction> {
|
|
|
|
public:
|
|
|
|
DBCompactionTestWithBottommostParam()
|
|
|
|
: DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
|
|
|
|
bottommost_level_compaction_ = GetParam();
|
|
|
|
}
|
|
|
|
|
|
|
|
BottommostLevelCompaction bottommost_level_compaction_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class DBCompactionDirectIOTest : public DBCompactionTest,
|
|
|
|
public ::testing::WithParamInterface<bool> {
|
|
|
|
public:
|
|
|
|
DBCompactionDirectIOTest() : DBCompactionTest() {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Param = true : target level is non-empty
|
|
|
|
// Param = false: level between target level and source level
|
|
|
|
// is not empty.
|
|
|
|
class ChangeLevelConflictsWithAuto
|
|
|
|
: public DBCompactionTest,
|
|
|
|
public ::testing::WithParamInterface<bool> {
|
|
|
|
public:
|
|
|
|
ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
|
|
|
|
};
|
|
|
|
|
Support subcmpct using reserved resources for round-robin priority (#10341)
Summary:
Earlier implementation of round-robin priority can only pick one file at a time and disallows parallel compactions within the same level. In this PR, round-robin compaction policy will expand towards more input files with respecting some additional constraints, which are summarized as follows:
* Constraint 1: We can only pick consecutive files
- Constraint 1a: When a file is being compacted (or some input files are being compacted after expanding), we cannot choose it and have to stop choosing more files
- Constraint 1b: When we reach the last file (with the largest keys), we cannot choose more files (the next file will be the first one with small keys)
* Constraint 2: We should ensure the total compaction bytes (including the overlapped files from the next level) is no more than `mutable_cf_options_.max_compaction_bytes`
* Constraint 3: We try our best to pick as many files as possible so that the post-compaction level size can be just less than `MaxBytesForLevel(start_level_)`
* Constraint 4: If trivial move is allowed, we reuse the logic of `TryNonL0TrivialMove()` instead of expanding files with Constraint 3
More details can be found in `LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion()`.
The above optimization accelerates the process of moving the compaction cursor, in which the write-amp can be further reduced. While a large compaction may lead to high write stall, we break this large compaction into several subcompactions **regardless of** the `max_subcompactions` limit. The number of subcompactions for round-robin compaction priority is determined through the following steps:
* Step 1: Initialized against `max_output_file_limit`, the number of input files in the start level, and also the range size limit `ranges.size()`
* Step 2: Call `AcquireSubcompactionResources()`when max subcompactions is not sufficient, but we may or may not obtain desired resources, additional number of resources is stored in `extra_num_subcompaction_threads_reserved_`). Subcompaction limit is changed and update `num_planned_subcompactions` with `GetSubcompactionLimit()`
* Step 3: Call `ShrinkSubcompactionResources()` to ensure extra resources can be released (extra resources may exist for round-robin compaction when the number of actual number of subcompactions is less than the number of planned subcompactions)
More details can be found in `CompactionJob::AcquireSubcompactionResources()`,`CompactionJob::ShrinkSubcompactionResources()`, and `CompactionJob::ReleaseSubcompactionResources()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10341
Test Plan: Add `CompactionPriMultipleFilesRoundRobin[1-3]` unit test in `compaction_picker_test.cc` and `RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources/[0-4]`, `RoundRobinSubcompactionsAgainstPressureToken.PressureTokenTest/[0-1]` in `db_compaction_test.cc`
Reviewed By: ajkr, hx235
Differential Revision: D37792644
Pulled By: littlepig2013
fbshipit-source-id: 7fecb7c4ffd97b34bbf6e3b760b2c35a772a0657
2 years ago
|
|
|
// Param = true: grab the compaction pressure token (enable
|
|
|
|
// parallel compactions)
|
|
|
|
// Param = false: Not grab the token (no parallel compactions)
|
|
|
|
class RoundRobinSubcompactionsAgainstPressureToken
|
|
|
|
: public DBCompactionTest,
|
|
|
|
public ::testing::WithParamInterface<bool> {
|
|
|
|
public:
|
|
|
|
RoundRobinSubcompactionsAgainstPressureToken() {
|
|
|
|
grab_pressure_token_ = GetParam();
|
|
|
|
}
|
|
|
|
bool grab_pressure_token_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class RoundRobinSubcompactionsAgainstResources
|
|
|
|
: public DBCompactionTest,
|
|
|
|
public ::testing::WithParamInterface<std::tuple<int, int>> {
|
|
|
|
public:
|
|
|
|
RoundRobinSubcompactionsAgainstResources() {
|
|
|
|
total_low_pri_threads_ = std::get<0>(GetParam());
|
|
|
|
max_compaction_limits_ = std::get<1>(GetParam());
|
|
|
|
}
|
|
|
|
int total_low_pri_threads_;
|
|
|
|
int max_compaction_limits_;
|
|
|
|
};
|
|
|
|
|
Support subcmpct using reserved resources for round-robin priority (#10341)
Summary:
Earlier implementation of round-robin priority can only pick one file at a time and disallows parallel compactions within the same level. In this PR, round-robin compaction policy will expand towards more input files with respecting some additional constraints, which are summarized as follows:
* Constraint 1: We can only pick consecutive files
- Constraint 1a: When a file is being compacted (or some input files are being compacted after expanding), we cannot choose it and have to stop choosing more files
- Constraint 1b: When we reach the last file (with the largest keys), we cannot choose more files (the next file will be the first one with small keys)
* Constraint 2: We should ensure the total compaction bytes (including the overlapped files from the next level) is no more than `mutable_cf_options_.max_compaction_bytes`
* Constraint 3: We try our best to pick as many files as possible so that the post-compaction level size can be just less than `MaxBytesForLevel(start_level_)`
* Constraint 4: If trivial move is allowed, we reuse the logic of `TryNonL0TrivialMove()` instead of expanding files with Constraint 3
More details can be found in `LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion()`.
The above optimization accelerates the process of moving the compaction cursor, in which the write-amp can be further reduced. While a large compaction may lead to high write stall, we break this large compaction into several subcompactions **regardless of** the `max_subcompactions` limit. The number of subcompactions for round-robin compaction priority is determined through the following steps:
* Step 1: Initialized against `max_output_file_limit`, the number of input files in the start level, and also the range size limit `ranges.size()`
* Step 2: Call `AcquireSubcompactionResources()`when max subcompactions is not sufficient, but we may or may not obtain desired resources, additional number of resources is stored in `extra_num_subcompaction_threads_reserved_`). Subcompaction limit is changed and update `num_planned_subcompactions` with `GetSubcompactionLimit()`
* Step 3: Call `ShrinkSubcompactionResources()` to ensure extra resources can be released (extra resources may exist for round-robin compaction when the number of actual number of subcompactions is less than the number of planned subcompactions)
More details can be found in `CompactionJob::AcquireSubcompactionResources()`,`CompactionJob::ShrinkSubcompactionResources()`, and `CompactionJob::ReleaseSubcompactionResources()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10341
Test Plan: Add `CompactionPriMultipleFilesRoundRobin[1-3]` unit test in `compaction_picker_test.cc` and `RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources/[0-4]`, `RoundRobinSubcompactionsAgainstPressureToken.PressureTokenTest/[0-1]` in `db_compaction_test.cc`
Reviewed By: ajkr, hx235
Differential Revision: D37792644
Pulled By: littlepig2013
fbshipit-source-id: 7fecb7c4ffd97b34bbf6e3b760b2c35a772a0657
2 years ago
|
|
|
namespace {
|
|
|
|
class FlushedFileCollector : public EventListener {
|
|
|
|
public:
|
|
|
|
FlushedFileCollector() {}
|
|
|
|
~FlushedFileCollector() override {}
|
|
|
|
|
|
|
|
void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
|
|
|
|
std::lock_guard<std::mutex> lock(mutex_);
|
|
|
|
flushed_files_.push_back(info.file_path);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> GetFlushedFiles() {
|
|
|
|
std::lock_guard<std::mutex> lock(mutex_);
|
|
|
|
std::vector<std::string> result;
|
|
|
|
for (auto fname : flushed_files_) {
|
|
|
|
result.push_back(fname);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ClearFlushedFiles() { flushed_files_.clear(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::vector<std::string> flushed_files_;
|
|
|
|
std::mutex mutex_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class SstStatsCollector : public EventListener {
|
|
|
|
public:
|
|
|
|
SstStatsCollector() : num_ssts_creation_started_(0) {}
|
|
|
|
|
|
|
|
void OnTableFileCreationStarted(
|
|
|
|
const TableFileCreationBriefInfo& /* info */) override {
|
|
|
|
++num_ssts_creation_started_;
|
|
|
|
}
|
|
|
|
|
|
|
|
int num_ssts_creation_started() { return num_ssts_creation_started_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::atomic<int> num_ssts_creation_started_;
|
|
|
|
};
|
|
|
|
|
|
|
|
static const int kCDTValueSize = 1000;
|
|
|
|
static const int kCDTKeysPerBuffer = 4;
|
|
|
|
static const int kCDTNumLevels = 8;
|
|
|
|
Options DeletionTriggerOptions(Options options) {
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
|
|
|
|
options.min_write_buffer_number_to_merge = 1;
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
options.max_write_buffer_size_to_maintain = 0;
|
|
|
|
options.num_levels = kCDTNumLevels;
|
|
|
|
options.level0_file_num_compaction_trigger = 1;
|
|
|
|
options.target_file_size_base = options.write_buffer_size * 2;
|
|
|
|
options.target_file_size_multiplier = 2;
|
|
|
|
options.max_bytes_for_level_base =
|
|
|
|
options.target_file_size_base * options.target_file_size_multiplier;
|
|
|
|
options.max_bytes_for_level_multiplier = 2;
|
|
|
|
options.disable_auto_compactions = false;
|
|
|
|
options.compaction_options_universal.max_size_amplification_percent = 100;
|
|
|
|
return options;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
|
|
|
|
const SstFileMetaData& b) {
|
|
|
|
if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
|
|
|
|
if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
|
|
|
|
// b.smallestkey <= a.smallestkey <= b.largestkey
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
|
|
|
|
// a.smallestkey < b.smallestkey <= a.largestkey
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
|
|
|
|
if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
|
|
|
|
// b.smallestkey <= a.largestkey <= b.largestkey
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
|
|
|
|
// a.smallestkey <= b.largestkey < a.largestkey
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Identifies all files between level "min_level" and "max_level"
|
|
|
|
// which has overlapping key range with "input_file_meta".
|
|
|
|
void GetOverlappingFileNumbersForLevelCompaction(
|
|
|
|
const ColumnFamilyMetaData& cf_meta, const Comparator* comparator,
|
|
|
|
int min_level, int max_level, const SstFileMetaData* input_file_meta,
|
|
|
|
std::set<std::string>* overlapping_file_names) {
|
|
|
|
std::set<const SstFileMetaData*> overlapping_files;
|
|
|
|
overlapping_files.insert(input_file_meta);
|
|
|
|
for (int m = min_level; m <= max_level; ++m) {
|
|
|
|
for (auto& file : cf_meta.levels[m].files) {
|
|
|
|
for (auto* included_file : overlapping_files) {
|
|
|
|
if (HaveOverlappingKeyRanges(comparator, *included_file, file)) {
|
|
|
|
overlapping_files.insert(&file);
|
|
|
|
overlapping_file_names->insert(file.name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyCompactionResult(
|
|
|
|
const ColumnFamilyMetaData& cf_meta,
|
|
|
|
const std::set<std::string>& overlapping_file_numbers) {
|
|
|
|
#ifndef NDEBUG
|
|
|
|
for (auto& level : cf_meta.levels) {
|
|
|
|
for (auto& file : level.files) {
|
|
|
|
assert(overlapping_file_numbers.find(file.name) ==
|
|
|
|
overlapping_file_numbers.end());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
const SstFileMetaData* PickFileRandomly(const ColumnFamilyMetaData& cf_meta,
|
|
|
|
Random* rand, int* level = nullptr) {
|
|
|
|
auto file_id = rand->Uniform(static_cast<int>(cf_meta.file_count)) + 1;
|
|
|
|
for (auto& level_meta : cf_meta.levels) {
|
|
|
|
if (file_id <= level_meta.files.size()) {
|
|
|
|
if (level != nullptr) {
|
|
|
|
*level = level_meta.level;
|
|
|
|
}
|
|
|
|
auto result = rand->Uniform(file_id);
|
|
|
|
return &(level_meta.files[result]);
|
|
|
|
}
|
|
|
|
file_id -= static_cast<uint32_t>(level_meta.files.size());
|
|
|
|
}
|
|
|
|
assert(false);
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
// All the TEST_P tests run once with sub_compactions disabled (i.e.
|
|
|
|
// options.max_subcompactions = 1) and once with it enabled
|
|
|
|
TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
|
|
|
|
for (int tid = 0; tid < 3; ++tid) {
|
|
|
|
uint64_t db_size[2];
|
|
|
|
Options options = DeletionTriggerOptions(CurrentOptions());
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
if (tid == 1) {
|
|
|
|
// the following only disable stats update in DB::Open()
|
|
|
|
// and should not affect the result of this test.
|
|
|
|
options.skip_stats_update_on_db_open = true;
|
|
|
|
} else if (tid == 2) {
|
|
|
|
// third pass with universal compaction
|
|
|
|
options.compaction_style = kCompactionStyleUniversal;
|
|
|
|
options.num_levels = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
const int kTestSize = kCDTKeysPerBuffer * 1024;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int k = 0; k < kTestSize; ++k) {
|
|
|
|
values.push_back(rnd.RandomString(kCDTValueSize));
|
|
|
|
ASSERT_OK(Put(Key(k), values[k]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
|
|
|
|
|
|
|
|
for (int k = 0; k < kTestSize; ++k) {
|
|
|
|
ASSERT_OK(Delete(Key(k)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
|
|
|
|
|
|
|
|
if (options.compaction_style == kCompactionStyleUniversal) {
|
|
|
|
// Claim: in universal compaction none of the original data will remain
|
|
|
|
// once compactions settle.
|
|
|
|
//
|
|
|
|
// Proof: The compensated size of the file containing the most tombstones
|
|
|
|
// is enough on its own to trigger size amp compaction. Size amp
|
|
|
|
// compaction is a full compaction, so all tombstones meet the obsolete
|
|
|
|
// keys they cover.
|
|
|
|
ASSERT_EQ(0, db_size[1]);
|
|
|
|
} else {
|
|
|
|
// Claim: in level compaction at most `db_size[0] / 2` of the original
|
|
|
|
// data will remain once compactions settle.
|
|
|
|
//
|
|
|
|
// Proof: Assume the original data is all in the bottom level. If it were
|
|
|
|
// not, it would meet its tombstone sooner. The original data size is
|
|
|
|
// large enough to require fanout to bottom level to be greater than
|
|
|
|
// `max_bytes_for_level_multiplier == 2`. In the level just above,
|
|
|
|
// tombstones must cover less than `db_size[0] / 4` bytes since fanout >=
|
|
|
|
// 2 and file size is compensated by doubling the size of values we expect
|
|
|
|
// are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in
|
|
|
|
// levels above must cover less than `db_size[0] / 8` bytes of original
|
|
|
|
// data, `db_size[0] / 16`, and so on.
|
|
|
|
ASSERT_GT(db_size[0] / 2, db_size[1]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
|
|
|
|
// This test verify UpdateAccumulatedStats is not on
|
|
|
|
// if options.skip_stats_update_on_db_open = true
|
|
|
|
// The test will need to be updated if the internal behavior changes.
|
|
|
|
|
|
|
|
Options options = DeletionTriggerOptions(CurrentOptions());
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
const int kTestSize = kCDTKeysPerBuffer * 512;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int k = 0; k < kTestSize; ++k) {
|
|
|
|
values.push_back(rnd.RandomString(kCDTValueSize));
|
|
|
|
ASSERT_OK(Put(Key(k), values[k]));
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
Close();
|
|
|
|
|
|
|
|
int update_acc_stats_called = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionStorageInfo::UpdateAccumulatedStats",
|
|
|
|
[&](void* /* arg */) { ++update_acc_stats_called; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Reopen the DB with stats-update disabled
|
|
|
|
options.skip_stats_update_on_db_open = true;
|
|
|
|
options.max_open_files = 20;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
ASSERT_EQ(update_acc_stats_called, 0);
|
|
|
|
|
|
|
|
// Repeat the reopen process, but this time we enable
|
|
|
|
// stats-update.
|
|
|
|
options.skip_stats_update_on_db_open = false;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
ASSERT_GT(update_acc_stats_called, 0);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.max_open_files = 20;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
New Cache API for gathering statistics (#8225)
Summary:
Adds a new Cache::ApplyToAllEntries API that we expect to use
(in follow-up PRs) for efficiently gathering block cache statistics.
Notable features vs. old ApplyToAllCacheEntries:
* Includes key and deleter (in addition to value and charge). We could
have passed in a Handle but then more virtual function calls would be
needed to get the "fields" of each entry. We expect to use the 'deleter'
to identify the origin of entries, perhaps even more.
* Heavily tuned to minimize latency impact on operating cache. It
does this by iterating over small sections of each cache shard while
cycling through the shards.
* Supports tuning roughly how many entries to operate on for each
lock acquire and release, to control the impact on the latency of other
operations without excessive lock acquire & release. The right balance
can depend on the cost of the callback. Good default seems to be
around 256.
* There should be no need to disable thread safety. (I would expect
uncontended locks to be sufficiently fast.)
I have enhanced cache_bench to validate this approach:
* Reports a histogram of ns per operation, so we can look at the
ditribution of times, not just throughput (average).
* Can add a thread for simulated "gather stats" which calls
ApplyToAllEntries at a specified interval. We also generate a histogram
of time to run ApplyToAllEntries.
To make the iteration over some entries of each shard work as cleanly as
possible, even with resize between next set of entries, I have
re-arranged which hash bits are used for sharding and which for indexing
within a shard.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8225
Test Plan:
A couple of unit tests are added, but primary validation is manual, as
the primary risk is to performance.
The primary validation is using cache_bench to ensure that neither
the minor hashing changes nor the simulated stats gathering
significantly impact QPS or latency distribution. Note that adding op
latency histogram seriously impacts the benchmark QPS, so for a
fair baseline, we need the cache_bench changes (except remove simulated
stat gathering to make it compile). In short, we don't see any
reproducible difference in ops/sec or op latency unless we are gathering
stats nearly continuously. Test uses 10GB block cache with
8KB values to be somewhat realistic in the number of items to iterate
over.
Baseline typical output:
```
Complete in 92.017 s; Rough parallel ops/sec = 869401
Thread ops/sec = 54662
Operation latency (ns):
Count: 80000000 Average: 11223.9494 StdDev: 29.61
Min: 0 Median: 7759.3973 Max: 9620500
Percentiles: P50: 7759.40 P75: 14190.73 P99: 46922.75 P99.9: 77509.84 P99.99: 217030.58
------------------------------------------------------
[ 0, 1 ] 68 0.000% 0.000%
( 2900, 4400 ] 89 0.000% 0.000%
( 4400, 6600 ] 33630240 42.038% 42.038% ########
( 6600, 9900 ] 18129842 22.662% 64.700% #####
( 9900, 14000 ] 7877533 9.847% 74.547% ##
( 14000, 22000 ] 15193238 18.992% 93.539% ####
( 22000, 33000 ] 3037061 3.796% 97.335% #
( 33000, 50000 ] 1626316 2.033% 99.368%
( 50000, 75000 ] 421532 0.527% 99.895%
( 75000, 110000 ] 56910 0.071% 99.966%
( 110000, 170000 ] 16134 0.020% 99.986%
( 170000, 250000 ] 5166 0.006% 99.993%
( 250000, 380000 ] 3017 0.004% 99.996%
( 380000, 570000 ] 1337 0.002% 99.998%
( 570000, 860000 ] 805 0.001% 99.999%
( 860000, 1200000 ] 319 0.000% 100.000%
( 1200000, 1900000 ] 231 0.000% 100.000%
( 1900000, 2900000 ] 100 0.000% 100.000%
( 2900000, 4300000 ] 39 0.000% 100.000%
( 4300000, 6500000 ] 16 0.000% 100.000%
( 6500000, 9800000 ] 7 0.000% 100.000%
```
New, gather_stats=false. Median thread ops/sec of 5 runs:
```
Complete in 92.030 s; Rough parallel ops/sec = 869285
Thread ops/sec = 54458
Operation latency (ns):
Count: 80000000 Average: 11298.1027 StdDev: 42.18
Min: 0 Median: 7722.0822 Max: 6398720
Percentiles: P50: 7722.08 P75: 14294.68 P99: 47522.95 P99.9: 85292.16 P99.99: 228077.78
------------------------------------------------------
[ 0, 1 ] 109 0.000% 0.000%
( 2900, 4400 ] 793 0.001% 0.001%
( 4400, 6600 ] 34054563 42.568% 42.569% #########
( 6600, 9900 ] 17482646 21.853% 64.423% ####
( 9900, 14000 ] 7908180 9.885% 74.308% ##
( 14000, 22000 ] 15032072 18.790% 93.098% ####
( 22000, 33000 ] 3237834 4.047% 97.145% #
( 33000, 50000 ] 1736882 2.171% 99.316%
( 50000, 75000 ] 446851 0.559% 99.875%
( 75000, 110000 ] 68251 0.085% 99.960%
( 110000, 170000 ] 18592 0.023% 99.983%
( 170000, 250000 ] 7200 0.009% 99.992%
( 250000, 380000 ] 3334 0.004% 99.997%
( 380000, 570000 ] 1393 0.002% 99.998%
( 570000, 860000 ] 700 0.001% 99.999%
( 860000, 1200000 ] 293 0.000% 100.000%
( 1200000, 1900000 ] 196 0.000% 100.000%
( 1900000, 2900000 ] 69 0.000% 100.000%
( 2900000, 4300000 ] 32 0.000% 100.000%
( 4300000, 6500000 ] 10 0.000% 100.000%
```
New, gather_stats=true, 1 second delay between scans. Scans take about
1 second here so it's spending about 50% time scanning. Still the effect on
ops/sec and latency seems to be in the noise. Median thread ops/sec of 5 runs:
```
Complete in 91.890 s; Rough parallel ops/sec = 870608
Thread ops/sec = 54551
Operation latency (ns):
Count: 80000000 Average: 11311.2629 StdDev: 45.28
Min: 0 Median: 7686.5458 Max: 10018340
Percentiles: P50: 7686.55 P75: 14481.95 P99: 47232.60 P99.9: 79230.18 P99.99: 232998.86
------------------------------------------------------
[ 0, 1 ] 71 0.000% 0.000%
( 2900, 4400 ] 291 0.000% 0.000%
( 4400, 6600 ] 34492060 43.115% 43.116% #########
( 6600, 9900 ] 16727328 20.909% 64.025% ####
( 9900, 14000 ] 7845828 9.807% 73.832% ##
( 14000, 22000 ] 15510654 19.388% 93.220% ####
( 22000, 33000 ] 3216533 4.021% 97.241% #
( 33000, 50000 ] 1680859 2.101% 99.342%
( 50000, 75000 ] 439059 0.549% 99.891%
( 75000, 110000 ] 60540 0.076% 99.967%
( 110000, 170000 ] 14649 0.018% 99.985%
( 170000, 250000 ] 5242 0.007% 99.991%
( 250000, 380000 ] 3260 0.004% 99.995%
( 380000, 570000 ] 1599 0.002% 99.997%
( 570000, 860000 ] 1043 0.001% 99.999%
( 860000, 1200000 ] 471 0.001% 99.999%
( 1200000, 1900000 ] 275 0.000% 100.000%
( 1900000, 2900000 ] 143 0.000% 100.000%
( 2900000, 4300000 ] 60 0.000% 100.000%
( 4300000, 6500000 ] 27 0.000% 100.000%
( 6500000, 9800000 ] 7 0.000% 100.000%
( 9800000, 14000000 ] 1 0.000% 100.000%
Gather stats latency (us):
Count: 46 Average: 980387.5870 StdDev: 60911.18
Min: 879155 Median: 1033777.7778 Max: 1261431
Percentiles: P50: 1033777.78 P75: 1120666.67 P99: 1261431.00 P99.9: 1261431.00 P99.99: 1261431.00
------------------------------------------------------
( 860000, 1200000 ] 45 97.826% 97.826% ####################
( 1200000, 1900000 ] 1 2.174% 100.000%
Most recent cache entry stats:
Number of entries: 1295133
Total charge: 9.88 GB
Average key size: 23.4982
Average charge: 8.00 KB
Unique deleters: 3
```
Reviewed By: mrambacher
Differential Revision: D28295742
Pulled By: pdillinger
fbshipit-source-id: bbc4a552f91ba0fe10e5cc025c42cef5a81f2b95
4 years ago
|
|
|
// Avoid many shards with small max_open_files, where as little as
|
|
|
|
// two table insertions could lead to an LRU eviction, depending on
|
|
|
|
// hash values.
|
|
|
|
options.table_cache_numshardbits = 2;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
int num_table_cache_lookup = 0;
|
|
|
|
int num_new_table_reader = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"TableCache::FindTable:0", [&](void* arg) {
|
|
|
|
assert(arg != nullptr);
|
|
|
|
bool no_io = *(reinterpret_cast<bool*>(arg));
|
|
|
|
if (!no_io) {
|
|
|
|
// filter out cases for table properties queries.
|
|
|
|
num_table_cache_lookup++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"TableCache::GetTableReader:0",
|
|
|
|
[&](void* /*arg*/) { num_new_table_reader++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
|
|
|
|
ASSERT_OK(Put(Key(k), Key(k)));
|
|
|
|
ASSERT_OK(Put(Key(10 - k), "bar"));
|
|
|
|
if (k < options.level0_file_num_compaction_trigger - 1) {
|
|
|
|
num_table_cache_lookup = 0;
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// preloading iterator issues one table cache lookup and create
|
|
|
|
// a new table reader, if not preloaded.
|
|
|
|
int old_num_table_cache_lookup = num_table_cache_lookup;
|
|
|
|
ASSERT_GE(num_table_cache_lookup, 1);
|
|
|
|
ASSERT_EQ(num_new_table_reader, 1);
|
|
|
|
|
|
|
|
num_table_cache_lookup = 0;
|
|
|
|
num_new_table_reader = 0;
|
|
|
|
ASSERT_EQ(Key(k), Get(Key(k)));
|
|
|
|
// lookup iterator from table cache and no need to create a new one.
|
|
|
|
ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
|
|
|
|
ASSERT_EQ(num_new_table_reader, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
num_table_cache_lookup = 0;
|
|
|
|
num_new_table_reader = 0;
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// Preloading iterator issues one table cache lookup and creates
|
|
|
|
// a new table reader. One file is created for flush and one for compaction.
|
|
|
|
// Compaction inputs make no table cache look-up for data/range deletion
|
|
|
|
// iterators
|
|
|
|
// May preload table cache too.
|
|
|
|
ASSERT_GE(num_table_cache_lookup, 2);
|
|
|
|
int old_num_table_cache_lookup2 = num_table_cache_lookup;
|
|
|
|
|
|
|
|
// Create new iterator for:
|
|
|
|
// (1) 1 for verifying flush results
|
|
|
|
// (2) 1 for verifying compaction results.
|
|
|
|
// (3) New TableReaders will not be created for compaction inputs
|
|
|
|
ASSERT_EQ(num_new_table_reader, 2);
|
|
|
|
|
|
|
|
num_table_cache_lookup = 0;
|
|
|
|
num_new_table_reader = 0;
|
|
|
|
ASSERT_EQ(Key(1), Get(Key(1)));
|
|
|
|
ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
|
|
|
|
ASSERT_EQ(num_new_table_reader, 0);
|
|
|
|
|
|
|
|
num_table_cache_lookup = 0;
|
|
|
|
num_new_table_reader = 0;
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 2;
|
|
|
|
cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
// Only verifying compaction outputs issues one table cache lookup
|
|
|
|
// for both data block and range deletion block).
|
|
|
|
// May preload table cache too.
|
|
|
|
ASSERT_GE(num_table_cache_lookup, 1);
|
|
|
|
old_num_table_cache_lookup2 = num_table_cache_lookup;
|
|
|
|
// One for verifying compaction results.
|
|
|
|
// No new iterator created for compaction.
|
|
|
|
ASSERT_EQ(num_new_table_reader, 1);
|
|
|
|
|
|
|
|
num_table_cache_lookup = 0;
|
|
|
|
num_new_table_reader = 0;
|
|
|
|
ASSERT_EQ(Key(1), Get(Key(1)));
|
|
|
|
ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
|
|
|
|
ASSERT_EQ(num_new_table_reader, 0);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
|
|
|
|
for (int tid = 0; tid < 2; ++tid) {
|
|
|
|
uint64_t db_size[3];
|
|
|
|
Options options = DeletionTriggerOptions(CurrentOptions());
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
if (tid == 1) {
|
|
|
|
// second pass with universal compaction
|
|
|
|
options.compaction_style = kCompactionStyleUniversal;
|
|
|
|
options.num_levels = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
// round 1 --- insert key/value pairs.
|
|
|
|
const int kTestSize = kCDTKeysPerBuffer * 512;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int k = 0; k < kTestSize; ++k) {
|
|
|
|
values.push_back(rnd.RandomString(kCDTValueSize));
|
|
|
|
ASSERT_OK(Put(Key(k), values[k]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// round 2 --- disable auto-compactions and issue deletions.
|
|
|
|
options.create_if_missing = false;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
for (int k = 0; k < kTestSize; ++k) {
|
|
|
|
ASSERT_OK(Delete(Key(k)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
|
|
|
|
Close();
|
|
|
|
// as auto_compaction is off, we shouldn't see any reduction in db size.
|
|
|
|
ASSERT_LE(db_size[0], db_size[1]);
|
|
|
|
|
|
|
|
// round 3 --- reopen db with auto_compaction on and see if
|
|
|
|
// deletion compensation still work.
|
|
|
|
options.disable_auto_compactions = false;
|
|
|
|
Reopen(options);
|
|
|
|
// insert relatively small amount of data to trigger auto compaction.
|
|
|
|
for (int k = 0; k < kTestSize / 10; ++k) {
|
|
|
|
ASSERT_OK(Put(Key(k), values[k]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
|
|
|
|
// this time we're expecting significant drop in size.
|
|
|
|
//
|
|
|
|
// See "CompactionDeletionTrigger" test for proof that at most
|
|
|
|
// `db_size[0] / 2` of the original data remains. In addition to that, this
|
|
|
|
// test inserts `db_size[0] / 10` to push the tombstones into SST files and
|
|
|
|
// then through automatic compactions. So in total `3 * db_size[0] / 5` of
|
|
|
|
// the original data may remain.
|
|
|
|
ASSERT_GT(3 * db_size[0] / 5, db_size[2]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactRangeBottomPri) {
|
|
|
|
ASSERT_OK(Put(Key(50), ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put(Key(100), ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put(Key(200), ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
{
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 2;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,0,3", FilesPerLevel(0));
|
|
|
|
|
|
|
|
ASSERT_OK(Put(Key(1), ""));
|
|
|
|
ASSERT_OK(Put(Key(199), ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put(Key(2), ""));
|
|
|
|
ASSERT_OK(Put(Key(199), ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_EQ("2,0,3", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Now we have 2 L0 files, and 3 L2 files, and a manual compaction will
|
|
|
|
// be triggered.
|
|
|
|
// Two compaction jobs will run. One compacts 2 L0 files in Low Pri Pool
|
|
|
|
// and one compact to L2 in bottom pri pool.
|
|
|
|
int low_pri_count = 0;
|
|
|
|
int bottom_pri_count = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) {
|
|
|
|
Env::Priority* pri = reinterpret_cast<Env::Priority*>(arg);
|
|
|
|
// First time is low pri pool in the test case.
|
|
|
|
if (low_pri_count == 0 && bottom_pri_count == 0) {
|
|
|
|
ASSERT_EQ(Env::Priority::LOW, *pri);
|
|
|
|
}
|
|
|
|
if (*pri == Env::Priority::LOW) {
|
|
|
|
low_pri_count++;
|
|
|
|
} else {
|
|
|
|
bottom_pri_count++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
ASSERT_EQ(1, low_pri_count);
|
|
|
|
ASSERT_EQ(1, bottom_pri_count);
|
|
|
|
ASSERT_EQ("0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Recompact bottom most level uses bottom pool
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ASSERT_EQ(1, low_pri_count);
|
|
|
|
ASSERT_EQ(2, bottom_pri_count);
|
|
|
|
|
|
|
|
env_->SetBackgroundThreads(0, Env::Priority::BOTTOM);
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
// Low pri pool is used if bottom pool has size 0.
|
|
|
|
ASSERT_EQ(2, low_pri_count);
|
|
|
|
ASSERT_EQ(2, bottom_pri_count);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
|
|
|
|
uint64_t db_size[3];
|
|
|
|
for (int test = 0; test < 2; ++test) {
|
|
|
|
Options options = DeletionTriggerOptions(CurrentOptions());
|
|
|
|
options.skip_stats_update_on_db_open = (test == 0);
|
|
|
|
|
|
|
|
env_->random_read_counter_.Reset();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
// round 1 --- insert key/value pairs.
|
|
|
|
const int kTestSize = kCDTKeysPerBuffer * 512;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int k = 0; k < kTestSize; ++k) {
|
|
|
|
values.push_back(rnd.RandomString(kCDTValueSize));
|
|
|
|
ASSERT_OK(Put(Key(k), values[k]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// L1 and L2 can fit deletions iff size compensation does not take effect,
|
|
|
|
// i.e., when `skip_stats_update_on_db_open == true`. Move any remaining
|
|
|
|
// files at or above L2 down to L3 to ensure obsolete data does not
|
|
|
|
// accidentally meet its tombstone above L3. This makes the final size more
|
|
|
|
// deterministic and easy to see whether size compensation for deletions
|
|
|
|
// took effect.
|
|
|
|
MoveFilesToLevel(3 /* level */);
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// round 2 --- disable auto-compactions and issue deletions.
|
|
|
|
options.create_if_missing = false;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
|
|
|
|
env_->random_read_counter_.Reset();
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
for (int k = 0; k < kTestSize; ++k) {
|
|
|
|
ASSERT_OK(Delete(Key(k)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
|
|
|
|
Close();
|
|
|
|
// as auto_compaction is off, we shouldn't see any reduction in db size.
|
|
|
|
ASSERT_LE(db_size[0], db_size[1]);
|
|
|
|
|
|
|
|
// round 3 --- reopen db with auto_compaction on and see if
|
|
|
|
// deletion compensation still work.
|
|
|
|
options.disable_auto_compactions = false;
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
|
|
|
|
|
|
|
|
if (options.skip_stats_update_on_db_open) {
|
|
|
|
// If update stats on DB::Open is disable, we don't expect
|
|
|
|
// deletion entries taking effect.
|
|
|
|
//
|
|
|
|
// The deletions are small enough to fit in L1 and L2, and obsolete keys
|
|
|
|
// were moved to L3+, so none of the original data should have been
|
|
|
|
// dropped.
|
|
|
|
ASSERT_LE(db_size[0], db_size[2]);
|
|
|
|
} else {
|
|
|
|
// Otherwise, we should see a significant drop in db size.
|
|
|
|
//
|
|
|
|
// See "CompactionDeletionTrigger" test for proof that at most
|
|
|
|
// `db_size[0] / 2` of the original data remains.
|
|
|
|
ASSERT_GT(db_size[0] / 2, db_size[2]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
|
|
|
|
const int kNumKeysPerFile = 100;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
|
|
|
|
num++) {
|
|
|
|
std::vector<std::string> values;
|
|
|
|
// Write 100KB (100 values, each 1K)
|
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
values.push_back(rnd.RandomString(990));
|
|
|
|
ASSERT_OK(Put(1, Key(i), values[i]));
|
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(1, "", ""));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
// generate one more file in level-0, and should trigger level-0 compaction
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
values.push_back(rnd.RandomString(990));
|
|
|
|
ASSERT_OK(Put(1, Key(i), values[i]));
|
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(1, "", ""));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
|
|
|
|
}
|
|
|
|
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
TEST_F(DBCompactionTest, BGCompactionsAllowed) {
|
|
|
|
// Create several column families. Make compaction triggers in all of them
|
|
|
|
// and see number of compactions scheduled to be less than allowed.
|
|
|
|
const int kNumKeysPerFile = 100;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.num_levels = 3;
|
|
|
|
// Should speed up compaction when there are 4 files.
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.level0_slowdown_writes_trigger = 20;
|
|
|
|
options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large
|
|
|
|
options.max_background_compactions = 3;
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
|
|
|
|
// Block all threads in thread pool.
|
|
|
|
const size_t kTotalTasks = 4;
|
|
|
|
env_->SetBackgroundThreads(4, Env::LOW);
|
|
|
|
test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
|
|
|
|
for (size_t i = 0; i < kTotalTasks; i++) {
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
|
|
|
&sleeping_tasks[i], Env::Priority::LOW);
|
|
|
|
sleeping_tasks[i].WaitUntilSleeping();
|
|
|
|
}
|
|
|
|
|
|
|
|
CreateAndReopenWithCF({"one", "two", "three"}, options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int cf = 0; cf < 4; cf++) {
|
|
|
|
for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
|
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
ASSERT_OK(Put(cf, Key(i), ""));
|
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(cf, "", ""));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now all column families qualify compaction but only one should be
|
|
|
|
// scheduled, because no column family hits speed up condition.
|
|
|
|
ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
|
|
|
|
// Create two more files for one column family, which triggers speed up
|
|
|
|
// condition, three compactions will be scheduled.
|
|
|
|
for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
|
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
ASSERT_OK(Put(2, Key(i), ""));
|
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(2, "", ""));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
|
|
|
|
NumTableFilesAtLevel(0, 2));
|
|
|
|
}
|
|
|
|
ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
|
|
|
|
// Unblock all threads to unblock all compactions.
|
|
|
|
for (size_t i = 0; i < kTotalTasks; i++) {
|
|
|
|
sleeping_tasks[i].WakeUp();
|
|
|
|
sleeping_tasks[i].WaitUntilDone();
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
|
|
|
|
// Verify number of compactions allowed will come back to 1.
|
|
|
|
|
|
|
|
for (size_t i = 0; i < kTotalTasks; i++) {
|
|
|
|
sleeping_tasks[i].Reset();
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
|
|
|
&sleeping_tasks[i], Env::Priority::LOW);
|
|
|
|
sleeping_tasks[i].WaitUntilSleeping();
|
|
|
|
}
|
|
|
|
for (int cf = 0; cf < 4; cf++) {
|
|
|
|
for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
|
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
ASSERT_OK(Put(cf, Key(i), ""));
|
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(cf, "", ""));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now all column families qualify compaction but only one should be
|
|
|
|
// scheduled, because no column family hits speed up condition.
|
|
|
|
ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
|
|
|
|
for (size_t i = 0; i < kTotalTasks; i++) {
|
|
|
|
sleeping_tasks[i].WakeUp();
|
|
|
|
sleeping_tasks[i].WaitUntilDone();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 100000000; // Large write buffer
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
// Write 8MB (80 values, each 100K)
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int i = 0; i < 80; i++) {
|
|
|
|
values.push_back(rnd.RandomString(100000));
|
|
|
|
ASSERT_OK(Put(1, Key(i), values[i]));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reopening moves updates to level-0
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
|
|
|
|
true /* disallow trivial move */));
|
|
|
|
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
|
|
|
|
ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
|
|
|
|
for (int i = 0; i < 80; i++) {
|
|
|
|
ASSERT_EQ(Get(1, Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, MinorCompactionsHappen) {
|
|
|
|
do {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 10000;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
const int N = 500;
|
|
|
|
|
|
|
|
int starting_num_tables = TotalTableFiles(1);
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
|
|
|
|
}
|
|
|
|
int ending_num_tables = TotalTableFiles(1);
|
|
|
|
ASSERT_GT(ending_num_tables, starting_num_tables);
|
|
|
|
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
|
|
|
|
}
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
|
|
|
|
}
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, UserKeyCrossFile1) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// create first file and flush to l0
|
|
|
|
ASSERT_OK(Put("4", "A"));
|
|
|
|
ASSERT_OK(Put("3", "A"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("2", "A"));
|
|
|
|
ASSERT_OK(Delete("3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("3"));
|
|
|
|
|
|
|
|
// move both files down to l1
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("3"));
|
|
|
|
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
|
|
ASSERT_OK(Put("2", "B"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("3"));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, UserKeyCrossFile2) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// create first file and flush to l0
|
|
|
|
ASSERT_OK(Put("4", "A"));
|
|
|
|
ASSERT_OK(Put("3", "A"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("2", "A"));
|
|
|
|
ASSERT_OK(SingleDelete("3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("3"));
|
|
|
|
|
|
|
|
// move both files down to l1
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("3"));
|
|
|
|
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
|
|
ASSERT_OK(Put("2", "B"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("3"));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionSstPartitioner) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
std::shared_ptr<SstPartitionerFactory> factory(
|
|
|
|
NewSstPartitionerFixedPrefixFactory(4));
|
|
|
|
options.sst_partitioner_factory = factory;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// create first file and flush to l0
|
|
|
|
ASSERT_OK(Put("aaaa1", "A"));
|
|
|
|
ASSERT_OK(Put("bbbb1", "B"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("aaaa1", "A2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
// move both files down to l1
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
std::vector<LiveFileMetaData> files;
|
|
|
|
dbfull()->GetLiveFilesMetaData(&files);
|
|
|
|
ASSERT_EQ(2, files.size());
|
|
|
|
ASSERT_EQ("A2", Get("aaaa1"));
|
|
|
|
ASSERT_EQ("B", Get("bbbb1"));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionSstPartitionWithManualCompaction) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// create first file and flush to l0
|
|
|
|
ASSERT_OK(Put("000015", "A"));
|
|
|
|
ASSERT_OK(Put("000025", "B"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
// create second file and flush to l0
|
|
|
|
ASSERT_OK(Put("000015", "A2"));
|
|
|
|
ASSERT_OK(Put("000025", "B2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
// CONTROL 1: compact without partitioner
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.bottommost_level_compaction =
|
|
|
|
BottommostLevelCompaction::kForceOptimized;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
// Check (compacted but no partitioning yet)
|
|
|
|
std::vector<LiveFileMetaData> files;
|
|
|
|
dbfull()->GetLiveFilesMetaData(&files);
|
|
|
|
ASSERT_EQ(1, files.size());
|
|
|
|
|
|
|
|
// Install partitioner
|
|
|
|
std::shared_ptr<SstPartitionerFactory> factory(
|
|
|
|
NewSstPartitionerFixedPrefixFactory(5));
|
|
|
|
options.sst_partitioner_factory = factory;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// CONTROL 2: request compaction on range with no partition boundary and no
|
|
|
|
// overlap with actual entries
|
|
|
|
Slice from("000017");
|
|
|
|
Slice to("000019");
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to));
|
|
|
|
|
|
|
|
// Check (no partitioning yet)
|
|
|
|
files.clear();
|
|
|
|
dbfull()->GetLiveFilesMetaData(&files);
|
|
|
|
ASSERT_EQ(1, files.size());
|
|
|
|
ASSERT_EQ("A2", Get("000015"));
|
|
|
|
ASSERT_EQ("B2", Get("000025"));
|
|
|
|
|
|
|
|
// TEST: request compaction overlapping with partition boundary but no
|
|
|
|
// actual entries
|
|
|
|
// NOTE: `to` is INCLUSIVE
|
|
|
|
from = Slice("000019");
|
|
|
|
to = Slice("000020");
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to));
|
|
|
|
|
|
|
|
// Check (must be partitioned)
|
|
|
|
files.clear();
|
|
|
|
dbfull()->GetLiveFilesMetaData(&files);
|
|
|
|
ASSERT_EQ(2, files.size());
|
|
|
|
ASSERT_EQ("A2", Get("000015"));
|
|
|
|
ASSERT_EQ("B2", Get("000025"));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level0_file_num_compaction_trigger = 1;
|
|
|
|
std::shared_ptr<SstPartitionerFactory> factory(
|
|
|
|
NewSstPartitionerFixedPrefixFactory(4));
|
|
|
|
options.sst_partitioner_factory = factory;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// create first file and flush to l0
|
|
|
|
ASSERT_OK(Put("aaaa1", "A"));
|
|
|
|
ASSERT_OK(Put("bbbb1", "B"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
|
|
|
|
|
|
|
|
std::vector<LiveFileMetaData> files;
|
|
|
|
dbfull()->GetLiveFilesMetaData(&files);
|
|
|
|
ASSERT_EQ(2, files.size());
|
|
|
|
ASSERT_EQ("A", Get("aaaa1"));
|
|
|
|
ASSERT_EQ("B", Get("bbbb1"));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
|
|
|
|
FlushedFileCollector* collector = new FlushedFileCollector();
|
|
|
|
options.listeners.emplace_back(collector);
|
|
|
|
|
|
|
|
// compaction options
|
|
|
|
CompactionOptions compact_opt;
|
|
|
|
compact_opt.compression = kNoCompression;
|
|
|
|
compact_opt.output_file_size_limit = 4096;
|
|
|
|
const size_t key_len =
|
|
|
|
static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::vector<const Snapshot*> snaps;
|
|
|
|
|
|
|
|
// create first file and flush to l0
|
|
|
|
for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
|
|
|
|
ASSERT_OK(Put(key, std::string(key_len, 'A')));
|
|
|
|
snaps.push_back(dbfull()->GetSnapshot());
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
// create second file and flush to l0
|
|
|
|
for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
|
|
|
|
ASSERT_OK(Put(key, std::string(key_len, 'A')));
|
|
|
|
snaps.push_back(dbfull()->GetSnapshot());
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
// move both files down to l1
|
|
|
|
ASSERT_OK(
|
|
|
|
dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1));
|
|
|
|
|
|
|
|
// release snap so that first instance of key(3) can have seqId=0
|
|
|
|
for (auto snap : snaps) {
|
|
|
|
dbfull()->ReleaseSnapshot(snap);
|
|
|
|
}
|
|
|
|
|
|
|
|
// create 3 files in l0 so to trigger compaction
|
|
|
|
for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
|
|
|
|
ASSERT_OK(Put("2", std::string(1, 'A')));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_OK(Put("", ""));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
|
|
|
|
// github issue #2249
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// create two files in l1 that we can compact
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
|
|
|
|
ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A')));
|
|
|
|
ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A')));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
}
|
|
|
|
ASSERT_OK(
|
|
|
|
dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
|
|
|
|
ASSERT_OK(
|
|
|
|
dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
|
|
|
|
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
|
|
|
|
ASSERT_EQ(2, cf_meta.levels[1].files.size());
|
|
|
|
std::vector<std::string> input_filenames;
|
|
|
|
for (const auto& sst_file : cf_meta.levels[1].files) {
|
|
|
|
input_filenames.push_back(sst_file.name);
|
|
|
|
}
|
|
|
|
|
|
|
|
// note CompactionOptions::output_file_size_limit is unset.
|
|
|
|
CompactionOptions compact_opt;
|
|
|
|
compact_opt.compression = kNoCompression;
|
|
|
|
ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check that writes done during a memtable compaction are recovered
|
|
|
|
// if the database is shutdown during the memtable compaction.
|
|
|
|
TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
|
|
|
|
do {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
// Trigger a long memtable compaction and reopen the database during it
|
|
|
|
ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file
|
|
|
|
ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable
|
|
|
|
ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction
|
|
|
|
ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
ASSERT_EQ("v1", Get(1, "foo"));
|
|
|
|
ASSERT_EQ("v2", Get(1, "bar"));
|
|
|
|
ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
|
|
|
|
ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
|
|
|
|
} while (ChangeOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
|
|
|
|
int32_t trivial_move = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 100000000;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int32_t num_keys = 80;
|
|
|
|
int32_t value_size = 100 * 1024; // 100 KB
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int i = 0; i < num_keys; i++) {
|
|
|
|
values.push_back(rnd.RandomString(value_size));
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reopening moves updates to L0
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1
|
|
|
|
|
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
db_->GetLiveFilesMetaData(&metadata);
|
|
|
|
ASSERT_EQ(metadata.size(), 1U);
|
|
|
|
LiveFileMetaData level0_file = metadata[0]; // L0 file meta
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
|
|
|
|
// Compaction will initiate a trivial move from L0 to L1
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
|
|
|
|
// File moved From L0 to L1
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1
|
|
|
|
|
|
|
|
metadata.clear();
|
|
|
|
db_->GetLiveFilesMetaData(&metadata);
|
|
|
|
ASSERT_EQ(metadata.size(), 1U);
|
|
|
|
ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
|
|
|
|
ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
|
|
|
|
|
|
|
|
for (int i = 0; i < num_keys; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(trivial_move, 1);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
|
|
|
|
int32_t trivial_move = 0;
|
|
|
|
int32_t non_trivial_move = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
[&](void* /*arg*/) { non_trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.write_buffer_size = 10 * 1024 * 1024;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
// non overlapping ranges
|
|
|
|
std::vector<std::pair<int32_t, int32_t>> ranges = {
|
|
|
|
{100, 199}, {300, 399}, {0, 99}, {200, 299},
|
|
|
|
{600, 699}, {400, 499}, {500, 550}, {551, 599},
|
|
|
|
};
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<int32_t, std::string> values;
|
|
|
|
for (size_t i = 0; i < ranges.size(); i++) {
|
|
|
|
for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
|
|
|
|
values[j] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(j), values[j]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t level0_files = NumTableFilesAtLevel(0, 0);
|
|
|
|
ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
|
|
|
|
// Since data is non-overlapping we expect compaction to initiate
|
|
|
|
// a trivial move
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
// We expect that all the files were trivially moved from L0 to L1
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < ranges.size(); i++) {
|
|
|
|
for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
|
|
|
|
ASSERT_EQ(Get(Key(j)), values[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(trivial_move, 1);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
trivial_move = 0;
|
|
|
|
non_trivial_move = 0;
|
|
|
|
values.clear();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
// Same ranges as above but overlapping
|
|
|
|
ranges = {
|
|
|
|
{100, 199},
|
|
|
|
{300, 399},
|
|
|
|
{0, 99},
|
|
|
|
{200, 299},
|
|
|
|
{600, 699},
|
|
|
|
{400, 499},
|
|
|
|
{500, 560}, // this range overlap with the next
|
|
|
|
// one
|
|
|
|
{551, 599},
|
|
|
|
};
|
|
|
|
for (size_t i = 0; i < ranges.size(); i++) {
|
|
|
|
for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
|
|
|
|
values[j] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(j), values[j]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
|
|
|
|
for (size_t i = 0; i < ranges.size(); i++) {
|
|
|
|
for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
|
|
|
|
ASSERT_EQ(Get(Key(j)), values[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(trivial_move, 0);
|
|
|
|
ASSERT_EQ(non_trivial_move, 1);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
|
|
|
|
int32_t trivial_move = 0;
|
|
|
|
int32_t non_trivial_move = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
[&](void* /*arg*/) { non_trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.write_buffer_size = 10 * 1024 * 1024;
|
|
|
|
options.num_levels = 7;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
// Add 2 non-overlapping files
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<int32_t, std::string> values;
|
|
|
|
|
|
|
|
// file 1 [0 => 300]
|
|
|
|
for (int32_t i = 0; i <= 300; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// file 2 [600 => 700]
|
|
|
|
for (int32_t i = 600; i <= 700; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// 2 files in L0
|
|
|
|
ASSERT_EQ("2", FilesPerLevel(0));
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 6;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
// 2 files in L6
|
|
|
|
ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
ASSERT_EQ(trivial_move, 1);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
for (int32_t i = 0; i <= 300; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
for (int32_t i = 600; i <= 700; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, PartialOverlappingL0) {
|
|
|
|
class SubCompactionEventListener : public EventListener {
|
|
|
|
public:
|
|
|
|
void OnSubcompactionCompleted(const SubcompactionJobInfo&) override {
|
|
|
|
sub_compaction_finished_++;
|
|
|
|
}
|
|
|
|
std::atomic<int> sub_compaction_finished_{0};
|
|
|
|
};
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.write_buffer_size = 10 * 1024 * 1024;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
SubCompactionEventListener* listener = new SubCompactionEventListener();
|
|
|
|
options.listeners.emplace_back(listener);
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// For subcompactino to trigger, output level needs to be non-empty.
|
|
|
|
ASSERT_OK(Put("key", ""));
|
|
|
|
ASSERT_OK(Put("kez", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("key", ""));
|
|
|
|
ASSERT_OK(Put("kez", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
// Ranges that are only briefly overlapping so that they won't be trivially
|
|
|
|
// moved but subcompaction ranges would only contain a subset of files.
|
|
|
|
std::vector<std::pair<int32_t, int32_t>> ranges = {
|
|
|
|
{100, 199}, {198, 399}, {397, 600}, {598, 800}, {799, 900}, {895, 999},
|
|
|
|
};
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<int32_t, std::string> values;
|
|
|
|
for (size_t i = 0; i < ranges.size(); i++) {
|
|
|
|
for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
|
|
|
|
values[j] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(j), values[j]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t level0_files = NumTableFilesAtLevel(0, 0);
|
|
|
|
ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // One file in L1
|
|
|
|
|
|
|
|
listener->sub_compaction_finished_ = 0;
|
|
|
|
ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
if (max_subcompactions_ > 3) {
|
|
|
|
// RocksDB might not generate the exact number of sub compactions.
|
|
|
|
// Here we validate that at least subcompaction happened.
|
|
|
|
ASSERT_GT(listener->sub_compaction_finished_.load(), 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
// We expect that all the files were compacted to L1
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
|
|
|
|
ASSERT_GT(NumTableFilesAtLevel(1, 0), 1);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < ranges.size(); i++) {
|
|
|
|
for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
|
|
|
|
ASSERT_EQ(Get(Key(j)), values[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
|
|
|
|
int32_t trivial_move = 0;
|
|
|
|
int32_t non_trivial_move = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
[&](void* /*arg*/) { non_trivial_move++; });
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
bool first = true;
|
|
|
|
// Purpose of dependencies:
|
|
|
|
// 4 -> 1: ensure the order of two non-trivial compactions
|
|
|
|
// 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions
|
|
|
|
// are installed
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
{{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
|
|
|
|
{"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
|
|
|
|
{"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
if (first) {
|
|
|
|
first = false;
|
|
|
|
TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
|
|
|
|
} else { // second non-trivial compaction
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 10 * 1024 * 1024;
|
|
|
|
options.num_levels = 7;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options.max_background_compactions = 3;
|
|
|
|
options.target_file_size_base = 1 << 23; // 8 MB
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
// Add 2 non-overlapping files
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<int32_t, std::string> values;
|
|
|
|
|
|
|
|
// file 1 [0 => 100]
|
|
|
|
for (int32_t i = 0; i < 100; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// file 2 [100 => 300]
|
|
|
|
for (int32_t i = 100; i < 300; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// 2 files in L0
|
|
|
|
ASSERT_EQ("2", FilesPerLevel(0));
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 6;
|
|
|
|
compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
// Trivial move the two non-overlapping files to level 6
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
// 2 files in L6
|
|
|
|
ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
ASSERT_EQ(trivial_move, 1);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
// file 3 [ 0 => 200]
|
|
|
|
for (int32_t i = 0; i < 200; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// 1 files in L0
|
|
|
|
ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
|
|
|
|
// 2 files in L6, 1 file in L5
|
|
|
|
ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
ASSERT_EQ(trivial_move, 6);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::port::Thread threads([&] {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
compact_options.change_level = false;
|
|
|
|
compact_options.exclusive_manual_compaction = false;
|
|
|
|
std::string begin_string = Key(0);
|
|
|
|
std::string end_string = Key(199);
|
|
|
|
Slice begin(begin_string);
|
|
|
|
Slice end(end_string);
|
|
|
|
// First non-trivial compaction is triggered
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
|
|
|
|
// file 4 [300 => 400)
|
|
|
|
for (int32_t i = 300; i <= 400; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// file 5 [400 => 500)
|
|
|
|
for (int32_t i = 400; i <= 500; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// file 6 [500 => 600)
|
|
|
|
for (int32_t i = 500; i <= 600; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
// Second non-trivial compaction is triggered
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// Before two non-trivial compactions are installed, there are 3 files in L0
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
|
|
|
|
TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// After two non-trivial compactions are installed, there is 1 file in L6, and
|
|
|
|
// 1 file in L1
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
|
|
|
|
threads.join();
|
|
|
|
|
|
|
|
for (int32_t i = 0; i < 600; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Disable as the test is flaky.
|
|
|
|
TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
int32_t trivial_move = 0;
|
|
|
|
int32_t non_trivial_move = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
[&](void* /*arg*/) { non_trivial_move++; });
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
bool first = true;
|
|
|
|
bool second = true;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
{{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
|
|
|
|
{"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
if (first) {
|
|
|
|
TEST_SYNC_POINT("DBCompaction::PartialFill:4");
|
|
|
|
first = false;
|
|
|
|
TEST_SYNC_POINT("DBCompaction::PartialFill:3");
|
|
|
|
} else if (second) {
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 10 * 1024 * 1024;
|
|
|
|
options.max_bytes_for_level_multiplier = 2;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options.max_background_compactions = 3;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
// make sure all background compaction jobs can be scheduled
|
|
|
|
auto stop_token =
|
|
|
|
dbfull()->TEST_write_controler().GetCompactionPressureToken();
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
// Add 2 non-overlapping files
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<int32_t, std::string> values;
|
|
|
|
|
|
|
|
// file 1 [0 => 100]
|
|
|
|
for (int32_t i = 0; i < 100; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// file 2 [100 => 300]
|
|
|
|
for (int32_t i = 100; i < 300; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// 2 files in L0
|
|
|
|
ASSERT_EQ("2", FilesPerLevel(0));
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 2;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
// 2 files in L2
|
|
|
|
ASSERT_EQ("0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
ASSERT_EQ(trivial_move, 1);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
// file 3 [ 0 => 200]
|
|
|
|
for (int32_t i = 0; i < 200; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// 2 files in L2, 1 in L0
|
|
|
|
ASSERT_EQ("1,0,2", FilesPerLevel(0));
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
|
|
|
|
// 2 files in L2, 1 in L1
|
|
|
|
ASSERT_EQ("0,1,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
ASSERT_EQ(trivial_move, 2);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::port::Thread threads([&] {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
compact_options.change_level = false;
|
|
|
|
compact_options.exclusive_manual_compaction = false;
|
|
|
|
std::string begin_string = Key(0);
|
|
|
|
std::string end_string = Key(199);
|
|
|
|
Slice begin(begin_string);
|
|
|
|
Slice end(end_string);
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("DBCompaction::PartialFill:1");
|
|
|
|
// Many files 4 [300 => 4300)
|
|
|
|
for (int32_t i = 0; i <= 5; i++) {
|
|
|
|
for (int32_t j = 300; j < 4300; j++) {
|
|
|
|
if (j == 2300) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
}
|
|
|
|
values[j] = rnd.RandomString(value_size);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(Put(Key(j), values[j]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Verify level sizes
|
|
|
|
uint64_t target_size = 4 * options.max_bytes_for_level_base;
|
|
|
|
for (int32_t i = 1; i < options.num_levels; i++) {
|
|
|
|
ASSERT_LE(SizeAtLevel(i), target_size);
|
|
|
|
target_size = static_cast<uint64_t>(target_size *
|
|
|
|
options.max_bytes_for_level_multiplier);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
}
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("DBCompaction::PartialFill:2");
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
threads.join();
|
|
|
|
|
|
|
|
for (int32_t i = 0; i < 4300; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) {
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
|
|
|
|
"DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"},
|
|
|
|
{"DBImpl::WaitForPendingWrites:BeforeBlock",
|
|
|
|
"DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.unordered_write = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("foo", "v1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("bar", "v1"));
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); });
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL");
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
writer.join();
|
|
|
|
ASSERT_EQ(Get("foo"), "v2");
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_EQ(Get("foo"), "v2");
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DeleteFileRange) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 10 * 1024 * 1024;
|
|
|
|
options.max_bytes_for_level_multiplier = 2;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options.max_background_compactions = 3;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
// Add 2 non-overlapping files
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<int32_t, std::string> values;
|
|
|
|
|
|
|
|
// file 1 [0 => 100]
|
|
|
|
for (int32_t i = 0; i < 100; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// file 2 [100 => 300]
|
|
|
|
for (int32_t i = 100; i < 300; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// 2 files in L0
|
|
|
|
ASSERT_EQ("2", FilesPerLevel(0));
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 2;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
// 2 files in L2
|
|
|
|
ASSERT_EQ("0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// file 3 [ 0 => 200]
|
|
|
|
for (int32_t i = 0; i < 200; i++) {
|
|
|
|
values[i] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// Many files 4 [300 => 4300)
|
|
|
|
for (int32_t i = 0; i <= 5; i++) {
|
|
|
|
for (int32_t j = 300; j < 4300; j++) {
|
|
|
|
if (j == 2300) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
values[j] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(j), values[j]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
// Verify level sizes
|
|
|
|
uint64_t target_size = 4 * options.max_bytes_for_level_base;
|
|
|
|
for (int32_t i = 1; i < options.num_levels; i++) {
|
|
|
|
ASSERT_LE(SizeAtLevel(i), target_size);
|
|
|
|
target_size = static_cast<uint64_t>(target_size *
|
|
|
|
options.max_bytes_for_level_multiplier);
|
|
|
|
}
|
|
|
|
|
|
|
|
const size_t old_num_files = CountFiles();
|
|
|
|
std::string begin_string = Key(1000);
|
|
|
|
std::string end_string = Key(2000);
|
|
|
|
Slice begin(begin_string);
|
|
|
|
Slice end(end_string);
|
|
|
|
ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
|
|
|
|
|
|
|
|
int32_t deleted_count = 0;
|
|
|
|
for (int32_t i = 0; i < 4300; i++) {
|
|
|
|
if (i < 1000 || i > 2000) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
} else {
|
|
|
|
ReadOptions roptions;
|
|
|
|
std::string result;
|
|
|
|
Status s = db_->Get(roptions, Key(i), &result);
|
|
|
|
ASSERT_TRUE(s.IsNotFound() || s.ok());
|
|
|
|
if (s.IsNotFound()) {
|
|
|
|
deleted_count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_GT(deleted_count, 0);
|
|
|
|
begin_string = Key(5000);
|
|
|
|
end_string = Key(6000);
|
|
|
|
Slice begin1(begin_string);
|
|
|
|
Slice end1(end_string);
|
|
|
|
// Try deleting files in range which contain no keys
|
|
|
|
ASSERT_OK(
|
|
|
|
DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
|
|
|
|
|
|
|
|
// Push data from level 0 to level 1 to force all data to be deleted
|
|
|
|
// Note that we don't delete level 0 files
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 1;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_OK(
|
|
|
|
DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
|
|
|
|
|
|
|
|
int32_t deleted_count2 = 0;
|
|
|
|
for (int32_t i = 0; i < 4300; i++) {
|
|
|
|
ReadOptions roptions;
|
|
|
|
std::string result;
|
|
|
|
ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound());
|
|
|
|
deleted_count2++;
|
|
|
|
}
|
|
|
|
ASSERT_GT(deleted_count2, deleted_count);
|
|
|
|
const size_t new_num_files = CountFiles();
|
|
|
|
ASSERT_GT(old_num_files, new_num_files);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DeleteFilesInRanges) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 10 * 1024 * 1024;
|
|
|
|
options.max_bytes_for_level_multiplier = 2;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.max_background_compactions = 3;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::map<int32_t, std::string> values;
|
|
|
|
|
|
|
|
// file [0 => 100), [100 => 200), ... [900, 1000)
|
|
|
|
for (auto i = 0; i < 10; i++) {
|
|
|
|
for (auto j = 0; j < 100; j++) {
|
|
|
|
auto k = i * 100 + j;
|
|
|
|
values[k] = rnd.RandomString(value_size);
|
|
|
|
ASSERT_OK(Put(Key(k), values[k]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_EQ("10", FilesPerLevel(0));
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 2;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,0,10", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// file [0 => 100), [200 => 300), ... [800, 900)
|
|
|
|
for (auto i = 0; i < 10; i += 2) {
|
|
|
|
for (auto j = 0; j < 100; j++) {
|
|
|
|
auto k = i * 100 + j;
|
|
|
|
ASSERT_OK(Put(Key(k), values[k]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_EQ("5,0,10", FilesPerLevel(0));
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,5,10", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Delete files in range [0, 299] (inclusive)
|
|
|
|
{
|
|
|
|
auto begin_str1 = Key(0), end_str1 = Key(100);
|
|
|
|
auto begin_str2 = Key(100), end_str2 = Key(200);
|
|
|
|
auto begin_str3 = Key(200), end_str3 = Key(299);
|
|
|
|
Slice begin1(begin_str1), end1(end_str1);
|
|
|
|
Slice begin2(begin_str2), end2(end_str2);
|
|
|
|
Slice begin3(begin_str3), end3(end_str3);
|
|
|
|
std::vector<RangePtr> ranges;
|
|
|
|
ranges.push_back(RangePtr(&begin1, &end1));
|
|
|
|
ranges.push_back(RangePtr(&begin2, &end2));
|
|
|
|
ranges.push_back(RangePtr(&begin3, &end3));
|
|
|
|
ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
|
|
|
|
ranges.data(), ranges.size()));
|
|
|
|
ASSERT_EQ("0,3,7", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Keys [0, 300) should not exist.
|
|
|
|
for (auto i = 0; i < 300; i++) {
|
|
|
|
ReadOptions ropts;
|
|
|
|
std::string result;
|
|
|
|
auto s = db_->Get(ropts, Key(i), &result);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
for (auto i = 300; i < 1000; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete files in range [600, 999) (exclusive)
|
|
|
|
{
|
|
|
|
auto begin_str1 = Key(600), end_str1 = Key(800);
|
|
|
|
auto begin_str2 = Key(700), end_str2 = Key(900);
|
|
|
|
auto begin_str3 = Key(800), end_str3 = Key(999);
|
|
|
|
Slice begin1(begin_str1), end1(end_str1);
|
|
|
|
Slice begin2(begin_str2), end2(end_str2);
|
|
|
|
Slice begin3(begin_str3), end3(end_str3);
|
|
|
|
std::vector<RangePtr> ranges;
|
|
|
|
ranges.push_back(RangePtr(&begin1, &end1));
|
|
|
|
ranges.push_back(RangePtr(&begin2, &end2));
|
|
|
|
ranges.push_back(RangePtr(&begin3, &end3));
|
|
|
|
ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
|
|
|
|
ranges.data(), ranges.size(), false));
|
|
|
|
ASSERT_EQ("0,1,4", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Keys [600, 900) should not exist.
|
|
|
|
for (auto i = 600; i < 900; i++) {
|
|
|
|
ReadOptions ropts;
|
|
|
|
std::string result;
|
|
|
|
auto s = db_->Get(ropts, Key(i), &result);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
for (auto i = 300; i < 600; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
for (auto i = 900; i < 1000; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete all files.
|
|
|
|
{
|
|
|
|
RangePtr range;
|
|
|
|
ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
|
|
|
|
ASSERT_EQ("", FilesPerLevel(0));
|
|
|
|
|
|
|
|
for (auto i = 0; i < 1000; i++) {
|
|
|
|
ReadOptions ropts;
|
|
|
|
std::string result;
|
|
|
|
auto s = db_->Get(ropts, Key(i), &result);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
|
|
|
|
// regression test for #2833: groups of files whose user-keys overlap at the
|
|
|
|
// endpoints could be split by `DeleteFilesInRange`. This caused old data to
|
|
|
|
// reappear, either because a new version of the key was removed, or a range
|
|
|
|
// deletion was partially dropped. It could also cause non-overlapping
|
|
|
|
// invariant to be violated if the files dropped by DeleteFilesInRange were
|
|
|
|
// a subset of files that a range deletion spans.
|
|
|
|
const int kNumL0Files = 2;
|
|
|
|
const int kValSize = 8 << 10; // 8KB
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
options.target_file_size_base = 1 << 10; // 1KB
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// The snapshot prevents key 1 from having its old version dropped. The low
|
|
|
|
// `target_file_size_base` ensures two keys will be in each output file.
|
|
|
|
const Snapshot* snapshot = nullptr;
|
|
|
|
Random rnd(301);
|
|
|
|
// The value indicates which flush the key belonged to, which is enough
|
|
|
|
// for us to determine the keys' relative ages. After L0 flushes finish,
|
|
|
|
// files look like:
|
|
|
|
//
|
|
|
|
// File 0: 0 -> vals[0], 1 -> vals[0]
|
|
|
|
// File 1: 1 -> vals[1], 2 -> vals[1]
|
|
|
|
//
|
|
|
|
// Then L0->L1 compaction happens, which outputs keys as follows:
|
|
|
|
//
|
|
|
|
// File 0: 0 -> vals[0], 1 -> vals[1]
|
|
|
|
// File 1: 1 -> vals[0], 2 -> vals[1]
|
|
|
|
//
|
|
|
|
// DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
|
|
|
|
// would cause `1 -> vals[0]` (an older key) to reappear.
|
|
|
|
std::string vals[kNumL0Files];
|
|
|
|
for (int i = 0; i < kNumL0Files; ++i) {
|
|
|
|
vals[i] = rnd.RandomString(kValSize);
|
|
|
|
ASSERT_OK(Put(Key(i), vals[i]));
|
|
|
|
ASSERT_OK(Put(Key(i + 1), vals[i]));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
if (i == 0) {
|
|
|
|
snapshot = db_->GetSnapshot();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
// Verify `DeleteFilesInRange` can't drop only file 0 which would cause
|
|
|
|
// "1 -> vals[0]" to reappear.
|
|
|
|
std::string begin_str = Key(0), end_str = Key(1);
|
|
|
|
Slice begin = begin_str, end = end_str;
|
|
|
|
ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
|
|
|
|
ASSERT_EQ(vals[1], Get(Key(1)));
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
|
|
|
|
int32_t trivial_move = 0;
|
|
|
|
int32_t non_trivial_move = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
[&](void* /*arg*/) { non_trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 100000000;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::vector<std::string> values;
|
|
|
|
// File with keys [ 0 => 99 ]
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
values.push_back(rnd.RandomString(value_size));
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_EQ("1", FilesPerLevel(0));
|
|
|
|
// Compaction will do L0=>L1 (trivial move) then move L1 files to L3
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 3;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(trivial_move, 1);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
// File with keys [ 100 => 199 ]
|
|
|
|
for (int i = 100; i < 200; i++) {
|
|
|
|
values.push_back(rnd.RandomString(value_size));
|
|
|
|
ASSERT_OK(Put(Key(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
// Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(trivial_move, 4);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
for (int i = 0; i < 200; i++) {
|
|
|
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.db_paths.emplace_back(dbname_, 500 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.max_bytes_for_level_base = 400 * 1024;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
int key_idx = 0;
|
|
|
|
|
|
|
|
// First three 110KB files are not going to second path.
|
|
|
|
// After that, (100K, 200K)
|
|
|
|
for (int num = 0; num < 3; num++) {
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Another 110KB triggers a compaction to 400K file to fill up first path
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
|
|
|
|
// (1, 4)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 1)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 2)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,2", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 3)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,3", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 4)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,4", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 5)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,5", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 6)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,6", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 7)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,7", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4, 8)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,8", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx; i++) {
|
|
|
|
auto v = Get(Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx; i++) {
|
|
|
|
auto v = Get(Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.db_paths.emplace_back(dbname_, 500 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.max_bytes_for_level_base = 400 * 1024;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
int key_idx = 0;
|
|
|
|
|
|
|
|
// Always gets compacted into 1 Level1 file,
|
|
|
|
// 0/1 Level 0 file
|
|
|
|
for (int num = 0; num < 3; num++) {
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx; i++) {
|
|
|
|
auto v = Get(Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx; i++) {
|
|
|
|
auto v = Get(Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.db_paths.emplace_back(dbname_, 500 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.max_bytes_for_level_base = 400 * 1024;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
std::vector<Options> option_vector;
|
|
|
|
option_vector.emplace_back(options);
|
|
|
|
ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
|
|
|
|
// Configure CF1 specific paths.
|
|
|
|
cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
|
|
|
|
cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
|
|
|
|
cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
|
|
|
|
option_vector.emplace_back(DBOptions(options), cf_opt1);
|
|
|
|
CreateColumnFamilies({"one"}, option_vector[1]);
|
|
|
|
|
|
|
|
// Configure CF2 specific paths.
|
|
|
|
cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
|
|
|
|
cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
|
|
|
|
cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
|
|
|
|
option_vector.emplace_back(DBOptions(options), cf_opt2);
|
|
|
|
CreateColumnFamilies({"two"}, option_vector[2]);
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
int key_idx = 0;
|
|
|
|
int key_idx1 = 0;
|
|
|
|
int key_idx2 = 0;
|
|
|
|
|
|
|
|
auto generate_file = [&]() {
|
|
|
|
GenerateNewFile(0, &rnd, &key_idx);
|
|
|
|
GenerateNewFile(1, &rnd, &key_idx1);
|
|
|
|
GenerateNewFile(2, &rnd, &key_idx2);
|
|
|
|
};
|
|
|
|
|
|
|
|
auto check_sstfilecount = [&](int path_id, int expected) {
|
|
|
|
ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
|
|
|
|
ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
|
|
|
|
ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
|
|
|
|
};
|
|
|
|
|
|
|
|
auto check_filesperlevel = [&](const std::string& expected) {
|
|
|
|
ASSERT_EQ(expected, FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(expected, FilesPerLevel(1));
|
|
|
|
ASSERT_EQ(expected, FilesPerLevel(2));
|
|
|
|
};
|
|
|
|
|
|
|
|
auto check_getvalues = [&]() {
|
|
|
|
for (int i = 0; i < key_idx; i++) {
|
|
|
|
auto v = Get(0, Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx1; i++) {
|
|
|
|
auto v = Get(1, Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx2; i++) {
|
|
|
|
auto v = Get(2, Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Check that default column family uses db_paths.
|
|
|
|
// And Column family "one" uses cf_paths.
|
|
|
|
|
|
|
|
// The compaction in level0 outputs the sst files in level1.
|
|
|
|
// The first path cannot hold level1's data(400KB+400KB > 500KB),
|
|
|
|
// so every compaction move a sst file to second path. Please
|
|
|
|
// refer to LevelCompactionBuilder::GetPathId.
|
|
|
|
for (int num = 0; num < 3; num++) {
|
|
|
|
generate_file();
|
|
|
|
}
|
|
|
|
check_sstfilecount(0, 1);
|
|
|
|
check_sstfilecount(1, 2);
|
|
|
|
|
|
|
|
generate_file();
|
|
|
|
check_sstfilecount(1, 3);
|
|
|
|
|
|
|
|
// (1, 4)
|
|
|
|
generate_file();
|
|
|
|
check_filesperlevel("1,4");
|
|
|
|
check_sstfilecount(1, 4);
|
|
|
|
check_sstfilecount(0, 1);
|
|
|
|
|
|
|
|
// (1, 4, 1)
|
|
|
|
generate_file();
|
|
|
|
check_filesperlevel("1,4,1");
|
|
|
|
check_sstfilecount(2, 1);
|
|
|
|
check_sstfilecount(1, 4);
|
|
|
|
check_sstfilecount(0, 1);
|
|
|
|
|
|
|
|
// (1, 4, 2)
|
|
|
|
generate_file();
|
|
|
|
check_filesperlevel("1,4,2");
|
|
|
|
check_sstfilecount(2, 2);
|
|
|
|
check_sstfilecount(1, 4);
|
|
|
|
check_sstfilecount(0, 1);
|
|
|
|
|
|
|
|
check_getvalues();
|
|
|
|
|
|
|
|
{ // Also verify GetLiveFilesStorageInfo with db_paths / cf_paths
|
|
|
|
std::vector<LiveFileStorageInfo> new_infos;
|
|
|
|
LiveFilesStorageInfoOptions lfsio;
|
|
|
|
lfsio.wal_size_for_flush = UINT64_MAX; // no flush
|
|
|
|
ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsio, &new_infos));
|
|
|
|
std::unordered_map<std::string, int> live_sst_by_dir;
|
|
|
|
for (auto& info : new_infos) {
|
|
|
|
if (info.file_type == kTableFile) {
|
|
|
|
live_sst_by_dir[info.directory]++;
|
|
|
|
// Verify file on disk (no directory confusion)
|
|
|
|
uint64_t size;
|
|
|
|
ASSERT_OK(env_->GetFileSize(
|
|
|
|
info.directory + "/" + info.relative_filename, &size));
|
|
|
|
ASSERT_EQ(info.size, size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(3U * 3U, live_sst_by_dir.size());
|
|
|
|
for (auto& paths : {options.db_paths, cf_opt1.cf_paths, cf_opt2.cf_paths}) {
|
|
|
|
ASSERT_EQ(1, live_sst_by_dir[paths[0].path]);
|
|
|
|
ASSERT_EQ(4, live_sst_by_dir[paths[1].path]);
|
|
|
|
ASSERT_EQ(2, live_sst_by_dir[paths[2].path]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
|
|
|
|
|
|
|
|
check_getvalues();
|
|
|
|
|
|
|
|
Destroy(options, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
|
|
|
|
Random rnd(301);
|
|
|
|
int max_key_level_insert = 200;
|
|
|
|
int max_key_universal_insert = 600;
|
|
|
|
|
|
|
|
// Stage 1: generate a db with level compaction
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options.max_bytes_for_level_base = 500 << 10; // 500KB
|
|
|
|
options.max_bytes_for_level_multiplier = 1;
|
|
|
|
options.target_file_size_base = 200 << 10; // 200KB
|
|
|
|
options.target_file_size_multiplier = 1;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
for (int i = 0; i <= max_key_level_insert; i++) {
|
|
|
|
// each value is 10K
|
|
|
|
ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_GT(TotalTableFiles(1, 4), 1);
|
|
|
|
int non_level0_num_files = 0;
|
|
|
|
for (int i = 1; i < options.num_levels; i++) {
|
|
|
|
non_level0_num_files += NumTableFilesAtLevel(i, 1);
|
|
|
|
}
|
|
|
|
ASSERT_GT(non_level0_num_files, 0);
|
|
|
|
|
|
|
|
// Stage 2: reopen with universal compaction - should fail
|
|
|
|
options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleUniversal;
|
|
|
|
options.num_levels = 1;
|
|
|
|
options = CurrentOptions(options);
|
|
|
|
Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
|
|
|
|
// Stage 3: compact into a single file and move the file to level 0
|
|
|
|
options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.target_file_size_base = INT_MAX;
|
|
|
|
options.target_file_size_multiplier = 1;
|
|
|
|
options.max_bytes_for_level_base = INT_MAX;
|
|
|
|
options.max_bytes_for_level_multiplier = 1;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options = CurrentOptions(options);
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 0;
|
|
|
|
// cannot use kForceOptimized here because the compaction here is expected
|
|
|
|
// to generate one output file
|
|
|
|
compact_options.bottommost_level_compaction =
|
|
|
|
BottommostLevelCompaction::kForce;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
ASSERT_OK(
|
|
|
|
dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr));
|
|
|
|
|
|
|
|
// Only 1 file in L0
|
|
|
|
ASSERT_EQ("1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Stage 4: re-open in universal compaction style and do some db operations
|
|
|
|
options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleUniversal;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options = CurrentOptions(options);
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
|
|
|
|
options.num_levels = 1;
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
|
|
|
|
for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
|
|
|
|
ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->Flush(FlushOptions()));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
for (int i = 1; i < options.num_levels; i++) {
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// verify keys inserted in both level compaction style and universal
|
|
|
|
// compaction style
|
|
|
|
std::string keys_in_db;
|
|
|
|
Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
keys_in_db.append(iter->key().ToString());
|
|
|
|
keys_in_db.push_back(',');
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
std::string expected_keys;
|
|
|
|
for (int i = 0; i <= max_key_universal_insert; i++) {
|
|
|
|
expected_keys.append(Key(i));
|
|
|
|
expected_keys.push_back(',');
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(keys_in_db, expected_keys);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "b", "v"));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Delete(1, "b"));
|
|
|
|
ASSERT_OK(Delete(1, "a"));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Delete(1, "a"));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "a", "v"));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_EQ("(a->v)", Contents(1));
|
|
|
|
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
|
|
|
|
ASSERT_EQ("(a->v)", Contents(1));
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "", ""));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Delete(1, "e"));
|
|
|
|
ASSERT_OK(Put(1, "", ""));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "c", "cv"));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "", ""));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "", ""));
|
|
|
|
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "d", "dv"));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "", ""));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Delete(1, "d"));
|
|
|
|
ASSERT_OK(Delete(1, "b"));
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_EQ("(->)(c->cv)", Contents(1));
|
|
|
|
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
|
|
|
|
ASSERT_EQ("(->)(c->cv)", Contents(1));
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ManualAutoRace) {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
|
|
|
|
{"DBImpl::RunManualCompaction:WaitScheduled",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ASSERT_OK(Put(1, "foo", ""));
|
|
|
|
ASSERT_OK(Put(1, "bar", ""));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_OK(Put(1, "foo", ""));
|
|
|
|
ASSERT_OK(Put(1, "bar", ""));
|
|
|
|
// Generate four files in CF 0, which should trigger an auto compaction
|
|
|
|
ASSERT_OK(Put("foo", ""));
|
|
|
|
ASSERT_OK(Put("bar", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("foo", ""));
|
|
|
|
ASSERT_OK(Put("bar", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("foo", ""));
|
|
|
|
ASSERT_OK(Put("bar", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("foo", ""));
|
|
|
|
ASSERT_OK(Put("bar", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// The auto compaction is scheduled but waited until here
|
|
|
|
TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
|
|
|
|
// The auto compaction will wait until the manual compaction is registerd
|
|
|
|
// before processing so that it will be cancelled.
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = true;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Eventually the cancelled compaction will be rescheduled and executed.
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, ManualCompaction) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
// iter - 0 with 7 levels
|
|
|
|
// iter - 1 with 3 levels
|
|
|
|
for (int iter = 0; iter < 2; ++iter) {
|
|
|
|
MakeTables(3, "p", "q", 1);
|
|
|
|
ASSERT_EQ("1,1,1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compaction range falls before files
|
|
|
|
Compact(1, "", "c");
|
|
|
|
ASSERT_EQ("1,1,1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compaction range falls after files
|
|
|
|
Compact(1, "r", "z");
|
|
|
|
ASSERT_EQ("1,1,1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compaction range overlaps files
|
|
|
|
Compact(1, "p", "q");
|
|
|
|
ASSERT_EQ("0,0,1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Populate a different range
|
|
|
|
MakeTables(3, "c", "e", 1);
|
|
|
|
ASSERT_EQ("1,1,2", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compact just the new range
|
|
|
|
Compact(1, "b", "f");
|
|
|
|
ASSERT_EQ("0,0,2", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compact all
|
|
|
|
MakeTables(1, "a", "z", 1);
|
|
|
|
ASSERT_EQ("1,0,2", FilesPerLevel(1));
|
|
|
|
|
|
|
|
uint64_t prev_block_cache_add =
|
|
|
|
options.statistics->getTickerCount(BLOCK_CACHE_ADD);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
|
|
|
|
// Verify manual compaction doesn't fill block cache
|
|
|
|
ASSERT_EQ(prev_block_cache_add,
|
|
|
|
options.statistics->getTickerCount(BLOCK_CACHE_ADD));
|
|
|
|
|
|
|
|
ASSERT_EQ("0,0,1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
if (iter == 0) {
|
|
|
|
options = CurrentOptions();
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
// iter - 0 with 7 levels
|
|
|
|
// iter - 1 with 3 levels
|
|
|
|
for (int iter = 0; iter < 2; ++iter) {
|
|
|
|
for (int i = 0; i < 3; ++i) {
|
|
|
|
ASSERT_OK(Put(1, "p", "begin"));
|
|
|
|
ASSERT_OK(Put(1, "q", "end"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("3", FilesPerLevel(1));
|
|
|
|
ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// Compaction range falls before files
|
|
|
|
Compact(1, "", "c");
|
|
|
|
ASSERT_EQ("3", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compaction range falls after files
|
|
|
|
Compact(1, "r", "z");
|
|
|
|
ASSERT_EQ("3", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compaction range overlaps files
|
|
|
|
Compact(1, "p", "q", 1);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(1));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// Populate a different range
|
|
|
|
for (int i = 0; i < 3; ++i) {
|
|
|
|
ASSERT_OK(Put(1, "c", "begin"));
|
|
|
|
ASSERT_OK(Put(1, "e", "end"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("3,1", FilesPerLevel(1));
|
|
|
|
|
|
|
|
// Compact just the new range
|
|
|
|
Compact(1, "b", "f", 1);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("0,2", FilesPerLevel(1));
|
|
|
|
ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// Compact all
|
|
|
|
ASSERT_OK(Put(1, "a", "begin"));
|
|
|
|
ASSERT_OK(Put(1, "z", "end"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ("1,2", FilesPerLevel(1));
|
|
|
|
ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.target_path_id = 1;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(1));
|
|
|
|
ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
|
|
|
|
ASSERT_EQ(0, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
if (iter == 0) {
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
options = CurrentOptions();
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
|
|
|
|
options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
|
|
|
|
options.max_background_flushes = 1;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "foo", "v2"));
|
|
|
|
Compact(1, "a", "z");
|
|
|
|
const size_t num_files = CountLiveFiles();
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
ASSERT_OK(Put(1, "foo", "v2"));
|
|
|
|
Compact(1, "a", "z");
|
|
|
|
}
|
|
|
|
ASSERT_EQ(CountLiveFiles(), num_files);
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
Fix CompactFiles by adding all necessary files
Summary:
The compact files API had a bug where some overlapping files
are not added. These are files which overlap with files which were
added to the compaction input files, but not to the original set of
input files. This happens only when there are more than two levels
involved in the compaction. An example will illustrate this better.
Level 2 has 1 input file 1.sst which spans [20,30].
Level 3 has added file 2.sst which spans [10,25]
Level 4 has file 3.sst which spans [35,40] and
input file 4.sst which spans [46,50].
The existing code would not add 3.sst to the set of input_files because
it only becomes an overlapping file in level 4 and it wasn't one in
level 3.
When installing the results of the compaction, 3.sst would overlap with
output file from the compact files and result in the assertion in
version_set.cc:1130
// Must not overlap
assert(level <= 0 || level_files->empty() ||
internal_comparator_->Compare(
(*level_files)[level_files->size() - 1]->largest, f->smallest) <
0);
This change now adds overlapping files from the current level to the set
of input files also so that we don't hit the assertion above.
Test Plan:
d=/tmp/j; rm -rf $d; seq 1000 | parallel --gnu --eta
'd=/tmp/j/d-{}; mkdir -p $d; TEST_TMPDIR=$d ./db_compaction_test
--gtest_filter=*CompactilesOnLevel* --gtest_also_run_disabled_tests >&
'$d'/log-{}'
Reviewers: igor, yhchiang, sdong
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43437
10 years ago
|
|
|
// Check level comapction with compact files
|
|
|
|
TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
|
|
|
|
const int kTestKeySize = 16;
|
|
|
|
const int kTestValueSize = 984;
|
|
|
|
const int kEntrySize = kTestKeySize + kTestValueSize;
|
|
|
|
const int kEntriesPerBuffer = 100;
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.target_file_size_base = options.write_buffer_size;
|
|
|
|
options.max_bytes_for_level_base = options.target_file_size_base * 2;
|
|
|
|
options.level0_stop_writes_trigger = 2;
|
|
|
|
options.max_bytes_for_level_multiplier = 2;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options = CurrentOptions(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
|
|
|
|
ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
|
|
|
|
int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
|
|
|
|
for (int file_picked = 5; file_picked > 0; --file_picked) {
|
|
|
|
std::set<std::string> overlapping_file_names;
|
|
|
|
std::vector<std::string> compaction_input_file_names;
|
|
|
|
for (int f = 0; f < file_picked; ++f) {
|
|
|
|
int level = 0;
|
|
|
|
auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
|
|
|
|
compaction_input_file_names.push_back(file_meta->name);
|
|
|
|
GetOverlappingFileNumbersForLevelCompaction(
|
|
|
|
cf_meta, options.comparator, level, output_level, file_meta,
|
|
|
|
&overlapping_file_names);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
|
|
|
|
compaction_input_file_names,
|
|
|
|
output_level));
|
|
|
|
|
|
|
|
// Make sure all overlapping files do not exist after compaction
|
|
|
|
dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
|
|
|
|
VerifyCompactionResult(cf_meta, overlapping_file_names);
|
|
|
|
}
|
|
|
|
|
|
|
|
// make sure all key-values are still there.
|
|
|
|
for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
|
|
|
|
ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
|
|
|
|
Options options;
|
|
|
|
const int kKeySize = 16;
|
|
|
|
const int kKvSize = 1000;
|
|
|
|
const int kKeysPerBuffer = 100;
|
|
|
|
const int kNumL1Files = 5;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.write_buffer_size = kKeysPerBuffer * kKvSize;
|
|
|
|
options.max_write_buffer_number = 2;
|
|
|
|
options.target_file_size_base =
|
|
|
|
options.write_buffer_size * (options.max_write_buffer_number - 1);
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL1Files;
|
|
|
|
options.max_bytes_for_level_base =
|
|
|
|
options.level0_file_num_compaction_trigger *
|
|
|
|
options.target_file_size_base;
|
|
|
|
options.max_bytes_for_level_multiplier = 2;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
|
|
|
|
env_->SetBackgroundThreads(1, Env::HIGH);
|
|
|
|
env_->SetBackgroundThreads(1, Env::LOW);
|
|
|
|
// stop the compaction thread until we simulate the file creation failure.
|
|
|
|
test::SleepingBackgroundTask sleeping_task_low;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
|
|
|
|
options.env = env_;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const int kNumInsertedKeys = options.level0_file_num_compaction_trigger *
|
|
|
|
(options.max_write_buffer_number - 1) *
|
|
|
|
kKeysPerBuffer;
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (int k = 0; k < kNumInsertedKeys; ++k) {
|
|
|
|
keys.emplace_back(rnd.RandomString(kKeySize));
|
|
|
|
values.emplace_back(rnd.RandomString(kKvSize - kKeySize));
|
|
|
|
ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
|
|
|
|
// Make sure the number of L0 files can trigger compaction.
|
|
|
|
ASSERT_GE(NumTableFilesAtLevel(0),
|
|
|
|
options.level0_file_num_compaction_trigger);
|
|
|
|
|
|
|
|
auto previous_num_level0_files = NumTableFilesAtLevel(0);
|
|
|
|
|
|
|
|
// Fail the first file creation.
|
|
|
|
env_->non_writable_count_ = 1;
|
|
|
|
sleeping_task_low.WakeUp();
|
|
|
|
sleeping_task_low.WaitUntilDone();
|
|
|
|
|
|
|
|
// Expect compaction to fail here as one file will fail its
|
|
|
|
// creation.
|
|
|
|
ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
|
|
|
|
|
|
|
|
// Verify L0 -> L1 compaction does fail.
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
|
|
|
|
|
|
|
|
// Verify all L0 files are still there.
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
|
|
|
|
|
|
|
|
// All key-values must exist after compaction fails.
|
|
|
|
for (int k = 0; k < kNumInsertedKeys; ++k) {
|
|
|
|
ASSERT_EQ(values[k], Get(keys[k]));
|
|
|
|
}
|
|
|
|
|
|
|
|
env_->non_writable_count_ = 0;
|
|
|
|
|
|
|
|
// Make sure RocksDB will not get into corrupted state.
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Verify again after reopen.
|
|
|
|
for (int k = 0; k < kNumInsertedKeys; ++k) {
|
|
|
|
ASSERT_EQ(values[k], Get(keys[k]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
|
|
|
|
// iter 1 -- delete_obsolete_files_period_micros == 0
|
|
|
|
for (int iter = 0; iter < 2; ++iter) {
|
|
|
|
// This test triggers move compaction and verifies that the file is not
|
|
|
|
// deleted when it's part of move compaction
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
if (iter == 1) {
|
|
|
|
options.delete_obsolete_files_period_micros = 0;
|
|
|
|
}
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.level0_file_num_compaction_trigger =
|
|
|
|
2; // trigger compaction when we have 2 files
|
|
|
|
OnFileDeletionListener* listener = new OnFileDeletionListener();
|
|
|
|
options.listeners.emplace_back(listener);
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
// Create two 1MB sst files
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
// Create 1MB sst file
|
|
|
|
for (int j = 0; j < 100; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
// this should execute L0->L1
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// block compactions
|
|
|
|
test::SleepingBackgroundTask sleeping_task;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
|
|
|
|
options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
|
|
|
|
Reopen(options);
|
|
|
|
std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
// let compactions go
|
|
|
|
sleeping_task.WakeUp();
|
|
|
|
sleeping_task.WaitUntilDone();
|
|
|
|
|
|
|
|
// this should execute L1->L2 (move)
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ("0,0,1", FilesPerLevel(0));
|
|
|
|
|
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
db_->GetLiveFilesMetaData(&metadata);
|
|
|
|
ASSERT_EQ(metadata.size(), 1U);
|
|
|
|
auto moved_file_name = metadata[0].name;
|
|
|
|
|
|
|
|
// Create two more 1MB sst files
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
// Create 1MB sst file
|
|
|
|
for (int j = 0; j < 100; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
// this should execute both L0->L1 and L1->L2 (merge with previous file)
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ("0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// iterator is holding the file
|
|
|
|
ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
|
|
|
|
|
|
|
|
listener->SetExpectedFileName(dbname_ + moved_file_name);
|
|
|
|
ASSERT_OK(iterator->status());
|
|
|
|
iterator.reset();
|
|
|
|
|
|
|
|
// this file should have been compacted away
|
|
|
|
ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
|
|
|
|
listener->VerifyMatchedCount(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
|
|
|
|
if (!Zlib_Supported()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.max_bytes_for_level_base = 400 * 1024;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
// First two levels have no compression, so that a trivial move between
|
|
|
|
// them will be allowed. Level 2 has Zlib compression so that a trivial
|
|
|
|
// move to level 3 will not be allowed
|
|
|
|
options.compression_per_level = {kNoCompression, kNoCompression,
|
|
|
|
kZlibCompression};
|
|
|
|
int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"Compaction::InputCompressionMatchesOutput:Matches",
|
|
|
|
[&](void* /*arg*/) { matches++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"Compaction::InputCompressionMatchesOutput:DidntMatch",
|
|
|
|
[&](void* /*arg*/) { didnt_match++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
[&](void* /*arg*/) { non_trivial++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
int key_idx = 0;
|
|
|
|
|
|
|
|
// First three 110KB files are going to level 0
|
|
|
|
// After that, (100K, 200K)
|
|
|
|
for (int num = 0; num < 3; num++) {
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Another 110KB triggers a compaction to 400K file to fill up level 0
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ(4, GetSstFileCount(dbname_));
|
|
|
|
|
|
|
|
// (1, 4)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 1)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,1", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 2)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 3)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,3", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 4)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,4", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 5)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,5", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 6)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,6", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 7)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,7", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// (1, 4, 8)
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,4,8", FilesPerLevel(0));
|
|
|
|
|
|
|
|
ASSERT_EQ(matches, 12);
|
|
|
|
// Currently, the test relies on the number of calls to
|
|
|
|
// InputCompressionMatchesOutput() per compaction.
|
|
|
|
const int kCallsToInputCompressionMatch = 2;
|
|
|
|
ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
|
|
|
|
ASSERT_EQ(trivial_move, 12);
|
|
|
|
ASSERT_EQ(non_trivial, 8);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx; i++) {
|
|
|
|
auto v = Get(Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < key_idx; i++) {
|
|
|
|
auto v = Get(Key(i));
|
|
|
|
ASSERT_NE(v, "NOT_FOUND");
|
|
|
|
ASSERT_TRUE(v.size() == 1 || v.size() == 990);
|
|
|
|
}
|
|
|
|
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.max_background_compactions = 5;
|
|
|
|
options.soft_pending_compaction_bytes_limit = 0;
|
|
|
|
options.hard_pending_compaction_bytes_limit = 100;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit);
|
|
|
|
|
|
|
|
options.max_background_compactions = 3;
|
|
|
|
options.soft_pending_compaction_bytes_limit = 200;
|
|
|
|
options.hard_pending_compaction_bytes_limit = 150;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit);
|
|
|
|
}
|
|
|
|
|
|
|
|
// This tests for a bug that could cause two level0 compactions running
|
|
|
|
// concurrently
|
|
|
|
// TODO(aekmekji): Make sure that the reason this fails when run with
|
|
|
|
// max_subcompactions > 1 is not a correctness issue but just inherent to
|
|
|
|
// running parallel L0-L1 compactions
|
|
|
|
TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.write_buffer_size = 110 << 10;
|
|
|
|
options.arena_block_size = 4 << 10;
|
|
|
|
options.level0_file_num_compaction_trigger = 4;
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.max_bytes_for_level_base = 450 << 10;
|
|
|
|
options.target_file_size_base = 98 << 10;
|
|
|
|
options.max_write_buffer_number = 2;
|
|
|
|
options.max_background_compactions = 2;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// fill up the DB
|
|
|
|
Random rnd(301);
|
|
|
|
for (int num = 0; num < 10; num++) {
|
|
|
|
GenerateNewRandomFile(&rnd);
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"CompactionJob::Run():Start",
|
|
|
|
"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
|
|
|
|
{"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
|
|
|
|
"CompactionJob::Run():End"}});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// trigger L0 compaction
|
|
|
|
for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
|
|
|
|
num++) {
|
|
|
|
GenerateNewRandomFile(&rnd, /* nowait */ true);
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
|
|
|
|
|
|
|
|
GenerateNewRandomFile(&rnd, /* nowait */ true);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
|
|
|
|
for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
|
|
|
|
num++) {
|
|
|
|
GenerateNewRandomFile(&rnd, /* nowait */ true);
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
}
|
|
|
|
|
|
|
|
static std::string ShortKey(int i) {
|
|
|
|
assert(i < 10000);
|
|
|
|
char buf[100];
|
|
|
|
snprintf(buf, sizeof(buf), "key%04d", i);
|
|
|
|
return std::string(buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
|
|
|
|
int32_t trivial_move = 0;
|
|
|
|
int32_t non_trivial_move = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:TrivialMove",
|
|
|
|
[&](void* /*arg*/) { trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
[&](void* /*arg*/) { non_trivial_move++; });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// The key size is guaranteed to be <= 8
|
|
|
|
class ShortKeyComparator : public Comparator {
|
|
|
|
int Compare(const ROCKSDB_NAMESPACE::Slice& a,
|
|
|
|
const ROCKSDB_NAMESPACE::Slice& b) const override {
|
|
|
|
assert(a.size() <= 8);
|
|
|
|
assert(b.size() <= 8);
|
|
|
|
return BytewiseComparator()->Compare(a, b);
|
|
|
|
}
|
|
|
|
const char* Name() const override { return "ShortKeyComparator"; }
|
|
|
|
void FindShortestSeparator(
|
|
|
|
std::string* start,
|
|
|
|
const ROCKSDB_NAMESPACE::Slice& limit) const override {
|
|
|
|
return BytewiseComparator()->FindShortestSeparator(start, limit);
|
|
|
|
}
|
|
|
|
void FindShortSuccessor(std::string* key) const override {
|
|
|
|
return BytewiseComparator()->FindShortSuccessor(key);
|
|
|
|
}
|
|
|
|
} short_key_cmp;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.target_file_size_base = 100000000;
|
|
|
|
options.write_buffer_size = 100000000;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options.comparator = &short_key_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int32_t value_size = 10 * 1024; // 10 KB
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::vector<std::string> values;
|
|
|
|
// File with keys [ 0 => 99 ]
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
values.push_back(rnd.RandomString(value_size));
|
|
|
|
ASSERT_OK(Put(ShortKey(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_EQ("1", FilesPerLevel(0));
|
|
|
|
// Compaction will do L0=>L1 (trivial move) then move L1 files to L3
|
|
|
|
CompactRangeOptions compact_options;
|
|
|
|
compact_options.change_level = true;
|
|
|
|
compact_options.target_level = 3;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(trivial_move, 1);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
// File with keys [ 100 => 199 ]
|
|
|
|
for (int i = 100; i < 200; i++) {
|
|
|
|
values.push_back(rnd.RandomString(value_size));
|
|
|
|
ASSERT_OK(Put(ShortKey(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
|
|
|
|
// Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
|
|
|
|
// then compacte the bottommost level L3=>L3 (non trivial move)
|
|
|
|
compact_options = CompactRangeOptions();
|
|
|
|
compact_options.bottommost_level_compaction =
|
|
|
|
BottommostLevelCompaction::kForceOptimized;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(trivial_move, 4);
|
|
|
|
ASSERT_EQ(non_trivial_move, 1);
|
|
|
|
|
|
|
|
// File with keys [ 200 => 299 ]
|
|
|
|
for (int i = 200; i < 300; i++) {
|
|
|
|
values.push_back(rnd.RandomString(value_size));
|
|
|
|
ASSERT_OK(Put(ShortKey(i), values[i]));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
|
|
|
|
trivial_move = 0;
|
|
|
|
non_trivial_move = 0;
|
|
|
|
compact_options = CompactRangeOptions();
|
|
|
|
compact_options.bottommost_level_compaction =
|
|
|
|
BottommostLevelCompaction::kSkip;
|
|
|
|
// Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
|
|
|
|
// and will skip bottommost level compaction
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
|
|
|
|
ASSERT_EQ(trivial_move, 3);
|
|
|
|
ASSERT_EQ(non_trivial_move, 0);
|
|
|
|
|
|
|
|
for (int i = 0; i < 300; i++) {
|
|
|
|
ASSERT_EQ(Get(ShortKey(i)), values[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.level0_file_num_compaction_trigger = 5;
|
|
|
|
options.max_background_compactions = 2;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
options.write_buffer_size = 2 << 20; // 2MB
|
|
|
|
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_cache = NewLRUCache(64 << 20); // 64MB
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
table_options.pin_l0_filter_and_index_blocks_in_cache = true;
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
4 years ago
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const size_t kValueSize = 1 << 20;
|
|
|
|
Random rnd(301);
|
|
|
|
std::string value(rnd.RandomString(kValueSize));
|
|
|
|
|
|
|
|
// The L0->L1 must be picked before we begin flushing files to trigger
|
|
|
|
// intra-L0 compaction, and must not finish until after an intra-L0
|
|
|
|
// compaction has been picked.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"LevelCompactionPicker::PickCompaction:Return",
|
|
|
|
"DBCompactionTest::IntraL0Compaction:L0ToL1Ready"},
|
|
|
|
{"LevelCompactionPicker::PickCompactionBySize:0",
|
|
|
|
"CompactionJob::Run():Start"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// index: 0 1 2 3 4 5 6 7 8 9
|
|
|
|
// size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB
|
|
|
|
// score: 1.5 1.3 1.5 2.0 inf
|
|
|
|
//
|
|
|
|
// Files 0-4 will be included in an L0->L1 compaction.
|
|
|
|
//
|
|
|
|
// L0->L0 will be triggered since the sync points guarantee compaction to base
|
|
|
|
// level is still blocked when files 5-9 trigger another compaction.
|
|
|
|
//
|
|
|
|
// Files 6-9 are the longest span of available files for which
|
|
|
|
// work-per-deleted-file decreases (see "score" row above).
|
|
|
|
for (int i = 0; i < 10; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(0), "")); // prevents trivial move
|
|
|
|
if (i == 5) {
|
|
|
|
TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready");
|
|
|
|
ASSERT_OK(Put(Key(i + 1), value + value));
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(Put(Key(i + 1), value));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
std::vector<std::vector<FileMetaData>> level_to_files;
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
|
|
|
|
// L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
|
|
|
|
ASSERT_EQ(2, level_to_files[0].size());
|
|
|
|
ASSERT_GT(level_to_files[1].size(), 0);
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
|
|
|
|
}
|
|
|
|
|
|
|
|
// The index/filter in the file produced by intra-L0 should not be pinned.
|
|
|
|
// That means clearing unref'd entries in block cache and re-accessing the
|
|
|
|
// file produced by intra-L0 should bump the index block miss count.
|
|
|
|
uint64_t prev_index_misses =
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
|
|
|
|
table_options.block_cache->EraseUnRefEntries();
|
|
|
|
ASSERT_EQ("", Get(Key(0)));
|
|
|
|
ASSERT_EQ(prev_index_misses + 1,
|
|
|
|
TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
|
|
|
|
// regression test for issue #2722: L0->L0 compaction can resurrect deleted
|
|
|
|
// keys from older L0 files if L1+ files' key-ranges do not include the key.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.level0_file_num_compaction_trigger = 5;
|
|
|
|
options.max_background_compactions = 2;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const size_t kValueSize = 1 << 20;
|
|
|
|
Random rnd(301);
|
|
|
|
std::string value(rnd.RandomString(kValueSize));
|
|
|
|
|
|
|
|
// The L0->L1 must be picked before we begin flushing files to trigger
|
|
|
|
// intra-L0 compaction, and must not finish until after an intra-L0
|
|
|
|
// compaction has been picked.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"LevelCompactionPicker::PickCompaction:Return",
|
|
|
|
"DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
|
|
|
|
"L0ToL1Ready"},
|
|
|
|
{"LevelCompactionPicker::PickCompactionBySize:0",
|
|
|
|
"CompactionJob::Run():Start"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// index: 0 1 2 3 4 5 6 7 8 9
|
|
|
|
// size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB
|
|
|
|
// score: 1.25 1.33 1.5 2.0 inf
|
|
|
|
//
|
|
|
|
// Files 0-4 will be included in an L0->L1 compaction.
|
|
|
|
//
|
|
|
|
// L0->L0 will be triggered since the sync points guarantee compaction to base
|
|
|
|
// level is still blocked when files 5-9 trigger another compaction. All files
|
|
|
|
// 5-9 are included in the L0->L0 due to work-per-deleted file decreasing.
|
|
|
|
//
|
|
|
|
// Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
|
|
|
|
// L0->L0 preserves the deletion such that the key remains deleted.
|
|
|
|
for (int i = 0; i < 10; ++i) {
|
|
|
|
// key 0 serves both to prevent trivial move and as the key we want to
|
|
|
|
// verify is not resurrected by L0->L0 compaction.
|
|
|
|
if (i < 5) {
|
|
|
|
ASSERT_OK(Put(Key(0), ""));
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(Delete(Key(0)));
|
|
|
|
}
|
|
|
|
if (i == 5) {
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
|
|
|
|
"L0ToL1Ready");
|
|
|
|
}
|
|
|
|
ASSERT_OK(Put(Key(i + 1), value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
std::vector<std::vector<FileMetaData>> level_to_files;
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
|
|
|
|
// L0 has a single output file from L0->L0
|
|
|
|
ASSERT_EQ(1, level_to_files[0].size());
|
|
|
|
ASSERT_GT(level_to_files[1].size(), 0);
|
|
|
|
ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
|
|
|
|
|
|
|
|
ReadOptions roptions;
|
|
|
|
std::string result;
|
|
|
|
ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
|
|
|
|
const int kNumFilesTrigger = 3;
|
|
|
|
Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
|
|
|
|
for (bool use_universal_compaction : {false, true}) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
if (use_universal_compaction) {
|
|
|
|
options.compaction_style = kCompactionStyleUniversal;
|
|
|
|
} else {
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.level_compaction_dynamic_level_bytes = true;
|
|
|
|
}
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.write_buffer_size = 100 << 10; // 100KB
|
|
|
|
options.target_file_size_base = 32 << 10; // 32KB
|
|
|
|
options.level0_file_num_compaction_trigger = kNumFilesTrigger;
|
|
|
|
// Trigger compaction if size amplification exceeds 110%
|
|
|
|
options.compaction_options_universal.max_size_amplification_percent = 110;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int num_bottom_pri_compactions = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BGWorkBottomCompaction",
|
|
|
|
[&](void* /*arg*/) { ++num_bottom_pri_compactions; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int num = 0; num < kNumFilesTrigger; num++) {
|
|
|
|
ASSERT_EQ(NumSortedRuns(), num);
|
|
|
|
int key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ(1, num_bottom_pri_compactions);
|
|
|
|
|
|
|
|
// Verify that size amplification did occur
|
|
|
|
ASSERT_EQ(NumSortedRuns(), 1);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
|
|
|
|
// Deletions can be dropped when compacted to non-last level if they fall
|
|
|
|
// outside the lower-level files' key-ranges.
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// put key 1 and 3 in separate L1, L2 files.
|
|
|
|
// So key 0, 2, and 4+ fall outside these levels' key-ranges.
|
|
|
|
for (int level = 2; level >= 1; --level) {
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(2 * i + 1), "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(level);
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(level));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete keys in range [1, 4]. These L0 files will be compacted with L1:
|
|
|
|
// - Tombstones for keys 2 and 4 can be dropped early.
|
|
|
|
// - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
|
|
|
|
for (int i = 0; i < kNumL0Files; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
|
|
|
|
ASSERT_OK(Delete(Key(i + 1)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
for (int i = 0; i < kNumL0Files; ++i) {
|
|
|
|
std::string value;
|
|
|
|
ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(2, options.statistics->getTickerCount(
|
|
|
|
COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE));
|
|
|
|
ASSERT_EQ(2,
|
|
|
|
options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
|
|
|
|
// https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/
|
|
|
|
// CompactFiles() had a bug where it failed to pick a compaction when an L0
|
|
|
|
// compaction existed, but marked it as scheduled anyways. It'd never be
|
|
|
|
// unmarked as scheduled, so future compactions or DB close could hang.
|
|
|
|
const int kNumL0Files = 5;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files - 1;
|
|
|
|
options.max_background_compactions = 2;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"LevelCompactionPicker::PickCompaction:Return",
|
|
|
|
"DBCompactionTest::CompactFilesPendingL0Bug:Picked"},
|
|
|
|
{"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted",
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
auto schedule_multi_compaction_token =
|
|
|
|
dbfull()->TEST_write_controler().GetCompactionPressureToken();
|
|
|
|
|
|
|
|
// Files 0-3 will be included in an L0->L1 compaction.
|
|
|
|
//
|
|
|
|
// File 4 will be included in a call to CompactFiles() while the first
|
|
|
|
// compaction is running.
|
|
|
|
for (int i = 0; i < kNumL0Files - 1; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
|
|
|
|
ASSERT_OK(Put(Key(i + 1), "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
|
|
|
|
// file 4 flushed after 0-3 picked
|
|
|
|
ASSERT_OK(Put(Key(kNumL0Files), "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// previously DB close would hang forever as this situation caused scheduled
|
|
|
|
// compactions count to never decrement to zero.
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
|
|
|
|
ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
|
|
|
|
std::vector<std::string> input_filenames;
|
|
|
|
input_filenames.push_back(cf_meta.levels[0].files.front().name);
|
|
|
|
ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
|
|
|
|
0 /* output_level */));
|
|
|
|
TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
|
|
|
|
// Regression test for bug of not pulling in L0 files that overlap the user-
|
|
|
|
// specified input files in time- and key-ranges.
|
|
|
|
ASSERT_OK(Put(Key(0), "old_val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put(Key(0), "new_val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
|
|
|
|
ASSERT_GE(cf_meta.levels.size(), 2);
|
|
|
|
ASSERT_EQ(2, cf_meta.levels[0].files.size());
|
|
|
|
|
|
|
|
// Compacting {new L0 file, L1 file} should pull in the old L0 file since it
|
|
|
|
// overlaps in key-range and time-range.
|
|
|
|
std::vector<std::string> input_filenames;
|
|
|
|
input_filenames.push_back(cf_meta.levels[0].files.front().name);
|
|
|
|
ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
|
|
|
|
1 /* output_level */));
|
|
|
|
ASSERT_EQ("new_val", Get(Key(0)));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
const Snapshot* snapshot = nullptr;
|
|
|
|
const int kMaxKey = 10;
|
|
|
|
|
|
|
|
for (int i = 0; i < kMaxKey; i++) {
|
|
|
|
ASSERT_OK(Put(Key(i), Key(i)));
|
|
|
|
ASSERT_OK(Delete(Key(i)));
|
|
|
|
if (!snapshot) {
|
|
|
|
snapshot = db_->GetSnapshot();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey)));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// test DeleteFilesInRange() deletes the files already picked for compaction
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"VersionSet::LogAndApply:WriteManifestStart",
|
|
|
|
"BackgroundCallCompaction:0"},
|
|
|
|
{"DBImpl::BackgroundCompaction:Finish",
|
|
|
|
"VersionSet::LogAndApply:WriteManifestDone"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// release snapshot which mark bottommost file for compaction
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
std::string begin_string = Key(0);
|
|
|
|
std::string end_string = Key(kMaxKey + 1);
|
|
|
|
Slice begin(begin_string);
|
|
|
|
Slice end(end_string);
|
|
|
|
ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
|
|
|
|
// bottom-level files may contain deletions due to snapshots protecting the
|
|
|
|
// deleted keys. Once the snapshot is released, we should see files with many
|
|
|
|
// such deletions undergo single-file compactions.
|
|
|
|
const int kNumKeysPerFile = 1024;
|
|
|
|
const int kNumLevelFiles = 4;
|
|
|
|
const int kValueSize = 128;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.level0_file_num_compaction_trigger = kNumLevelFiles;
|
|
|
|
// inflate it a bit to account for key/metadata overhead
|
|
|
|
options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
|
|
|
|
CreateAndReopenWithCF({"one"}, options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
const Snapshot* snapshot = nullptr;
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
if (i == kNumLevelFiles - 1) {
|
|
|
|
snapshot = db_->GetSnapshot();
|
|
|
|
// delete every other key after grabbing a snapshot, so these deletions
|
|
|
|
// and the keys they cover can't be dropped until after the snapshot is
|
|
|
|
// released.
|
|
|
|
for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
|
|
|
|
ASSERT_OK(Delete(Key(j)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
if (i < kNumLevelFiles - 1) {
|
|
|
|
ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
|
|
|
|
|
|
|
|
std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
|
|
|
|
db_->GetLiveFilesMetaData(&pre_release_metadata);
|
|
|
|
// just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
|
|
|
|
// files does not need to be preserved in case of a future snapshot.
|
|
|
|
ASSERT_OK(Put(Key(0), "val"));
|
|
|
|
ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
|
|
|
|
// release snapshot and wait for compactions to finish. Single-file
|
|
|
|
// compactions should be triggered, which reduce the size of each bottom-level
|
|
|
|
// file without changing file count.
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
ASSERT_TRUE(compaction->compaction_reason() ==
|
|
|
|
CompactionReason::kBottommostFiles);
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
db_->GetLiveFilesMetaData(&post_release_metadata);
|
|
|
|
ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
|
|
|
|
|
|
|
|
for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
|
|
|
|
const auto& pre_file = pre_release_metadata[i];
|
|
|
|
const auto& post_file = post_release_metadata[i];
|
|
|
|
ASSERT_EQ(1, pre_file.level);
|
|
|
|
ASSERT_EQ(1, post_file.level);
|
|
|
|
// each file is smaller than it was before as it was rewritten without
|
|
|
|
// deletion markers/deleted keys.
|
|
|
|
ASSERT_LT(post_file.size, pre_file.size);
|
|
|
|
}
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) {
|
|
|
|
// bottom-level files may contain deletions due to snapshots protecting the
|
|
|
|
// deleted keys. Once the snapshot is released, we should see files with many
|
|
|
|
// such deletions undergo single-file compactions. But when disabling auto
|
|
|
|
// compactions, it shouldn't be triggered which may causing too many
|
|
|
|
// background jobs.
|
|
|
|
const int kNumKeysPerFile = 1024;
|
|
|
|
const int kNumLevelFiles = 4;
|
|
|
|
const int kValueSize = 128;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.level0_file_num_compaction_trigger = kNumLevelFiles;
|
|
|
|
// inflate it a bit to account for key/metadata overhead
|
|
|
|
options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
const Snapshot* snapshot = nullptr;
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
if (i == kNumLevelFiles - 1) {
|
|
|
|
snapshot = db_->GetSnapshot();
|
|
|
|
// delete every other key after grabbing a snapshot, so these deletions
|
|
|
|
// and the keys they cover can't be dropped until after the snapshot is
|
|
|
|
// released.
|
|
|
|
for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
|
|
|
|
ASSERT_OK(Delete(Key(j)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
if (i < kNumLevelFiles - 1) {
|
|
|
|
ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr));
|
|
|
|
ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
|
|
|
|
|
|
|
|
std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
|
|
|
|
db_->GetLiveFilesMetaData(&pre_release_metadata);
|
|
|
|
// just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
|
|
|
|
// files does not need to be preserved in case of a future snapshot.
|
|
|
|
ASSERT_OK(Put(Key(0), "val"));
|
|
|
|
|
|
|
|
// release snapshot and no compaction should be triggered.
|
|
|
|
std::atomic<int> num_compactions{0};
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:Start",
|
|
|
|
[&](void* /*arg*/) { num_compactions.fetch_add(1); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(0, num_compactions);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
db_->GetLiveFilesMetaData(&post_release_metadata);
|
|
|
|
ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
|
|
|
|
for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
|
|
|
|
const auto& pre_file = pre_release_metadata[i];
|
|
|
|
const auto& post_file = post_release_metadata[i];
|
|
|
|
ASSERT_EQ(1, pre_file.level);
|
|
|
|
ASSERT_EQ(1, post_file.level);
|
|
|
|
// each file is same as before with deletion markers/deleted keys.
|
|
|
|
ASSERT_EQ(post_file.size, pre_file.size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, RoundRobinTtlCompactionNormal) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.level0_file_num_compaction_trigger = 20;
|
|
|
|
options.ttl = 24 * 60 * 60; // 24 hours
|
|
|
|
options.compaction_pri = kRoundRobin;
|
|
|
|
env_->now_cpu_count_.store(0);
|
|
|
|
env_->SetMockSleep();
|
|
|
|
options.env = env_;
|
|
|
|
|
|
|
|
// add a small second for each wait time, to make sure the file is expired
|
|
|
|
int small_seconds = 1;
|
|
|
|
|
|
|
|
std::atomic_int ttl_compactions{0};
|
|
|
|
std::atomic_int round_robin_ttl_compactions{0};
|
|
|
|
std::atomic_int other_compactions{0};
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
auto compaction_reason = compaction->compaction_reason();
|
|
|
|
if (compaction_reason == CompactionReason::kTtl) {
|
|
|
|
ttl_compactions++;
|
|
|
|
} else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
|
|
|
|
round_robin_ttl_compactions++;
|
|
|
|
} else {
|
|
|
|
other_compactions++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Setup the files from lower level to up level, each file is 1 hour's older
|
|
|
|
// than the next one.
|
|
|
|
// create 10 files on the last level (L6)
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
for (int j = 0; j < 100; j++) {
|
|
|
|
ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(6);
|
|
|
|
|
|
|
|
// create 5 files on L5
|
|
|
|
for (int i = 0; i < 5; i++) {
|
|
|
|
for (int j = 0; j < 200; j++) {
|
|
|
|
ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
env_->MockSleepForSeconds(60 * 60);
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(5);
|
|
|
|
|
|
|
|
// create 3 files on L4
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
|
|
for (int j = 0; j < 300; j++) {
|
|
|
|
ASSERT_OK(Put(Key(i * 300 + j), "value" + std::to_string(i * 300 + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
env_->MockSleepForSeconds(60 * 60);
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(4);
|
|
|
|
|
|
|
|
// The LSM tree should be like:
|
|
|
|
// L4: [0, 299], [300, 599], [600, 899]
|
|
|
|
// L5: [0, 199] [200, 399]...............[800, 999]
|
|
|
|
// L6: [0,99][100,199][200,299][300,399]...............[800,899][900,999]
|
|
|
|
ASSERT_EQ("0,0,0,0,3,5,10", FilesPerLevel());
|
|
|
|
|
|
|
|
// make sure the first L5 file is expired
|
|
|
|
env_->MockSleepForSeconds(16 * 60 * 60 + small_seconds++);
|
|
|
|
|
|
|
|
// trigger TTL compaction
|
|
|
|
ASSERT_OK(Put(Key(4), "value" + std::to_string(1)));
|
|
|
|
ASSERT_OK(Put(Key(5), "value" + std::to_string(1)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
// verify there's a RoundRobin TTL compaction
|
|
|
|
ASSERT_EQ(1, round_robin_ttl_compactions);
|
|
|
|
round_robin_ttl_compactions = 0;
|
|
|
|
|
|
|
|
// expire 2 more files
|
|
|
|
env_->MockSleepForSeconds(2 * 60 * 60 + small_seconds++);
|
|
|
|
// trigger TTL compaction
|
|
|
|
ASSERT_OK(Put(Key(4), "value" + std::to_string(2)));
|
|
|
|
ASSERT_OK(Put(Key(5), "value" + std::to_string(2)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ(2, round_robin_ttl_compactions);
|
|
|
|
round_robin_ttl_compactions = 0;
|
|
|
|
|
|
|
|
// expire 4 more files, 2 out of 3 files on L4 are expired
|
|
|
|
env_->MockSleepForSeconds(4 * 60 * 60 + small_seconds++);
|
|
|
|
// trigger TTL compaction
|
|
|
|
ASSERT_OK(Put(Key(6), "value" + std::to_string(3)));
|
|
|
|
ASSERT_OK(Put(Key(7), "value" + std::to_string(3)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(4));
|
|
|
|
ASSERT_EQ(0, NumTableFilesAtLevel(5));
|
|
|
|
|
|
|
|
ASSERT_GT(round_robin_ttl_compactions, 0);
|
|
|
|
round_robin_ttl_compactions = 0;
|
|
|
|
|
|
|
|
// make the first L0 file expired, which triggers a normal TTL compaction
|
|
|
|
// instead of roundrobin TTL compaction, it will also include an extra file
|
|
|
|
// from L0 because of overlap
|
|
|
|
ASSERT_EQ(0, ttl_compactions);
|
|
|
|
env_->MockSleepForSeconds(19 * 60 * 60 + small_seconds++);
|
|
|
|
|
|
|
|
// trigger TTL compaction
|
|
|
|
ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
|
|
|
|
ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
// L0 -> L1 compaction is normal TTL compaction, L1 -> next levels compactions
|
|
|
|
// are RoundRobin TTL compaction.
|
|
|
|
ASSERT_GT(ttl_compactions, 0);
|
|
|
|
ttl_compactions = 0;
|
|
|
|
ASSERT_GT(round_robin_ttl_compactions, 0);
|
|
|
|
round_robin_ttl_compactions = 0;
|
|
|
|
|
|
|
|
// All files are expired, so only the last level has data
|
|
|
|
env_->MockSleepForSeconds(24 * 60 * 60);
|
|
|
|
// trigger TTL compaction
|
|
|
|
ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
|
|
|
|
ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
|
|
|
|
|
|
|
|
ASSERT_GT(ttl_compactions, 0);
|
|
|
|
ttl_compactions = 0;
|
|
|
|
ASSERT_GT(round_robin_ttl_compactions, 0);
|
|
|
|
round_robin_ttl_compactions = 0;
|
|
|
|
|
|
|
|
ASSERT_EQ(0, other_compactions);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, RoundRobinTtlCompactionUnsortedTime) {
|
|
|
|
// This is to test the case that the RoundRobin compaction cursor not pointing
|
|
|
|
// to the oldest file, RoundRobin compaction should still compact the file
|
|
|
|
// after cursor until all expired files are compacted.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.level0_file_num_compaction_trigger = 20;
|
|
|
|
options.ttl = 24 * 60 * 60; // 24 hours
|
|
|
|
options.compaction_pri = kRoundRobin;
|
|
|
|
env_->now_cpu_count_.store(0);
|
|
|
|
env_->SetMockSleep();
|
|
|
|
options.env = env_;
|
|
|
|
|
|
|
|
std::atomic_int ttl_compactions{0};
|
|
|
|
std::atomic_int round_robin_ttl_compactions{0};
|
|
|
|
std::atomic_int other_compactions{0};
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
auto compaction_reason = compaction->compaction_reason();
|
|
|
|
if (compaction_reason == CompactionReason::kTtl) {
|
|
|
|
ttl_compactions++;
|
|
|
|
} else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
|
|
|
|
round_robin_ttl_compactions++;
|
|
|
|
} else {
|
|
|
|
other_compactions++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// create 10 files on the last level (L6)
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
for (int j = 0; j < 100; j++) {
|
|
|
|
ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(6);
|
|
|
|
|
|
|
|
// create 5 files on L5
|
|
|
|
for (int i = 0; i < 5; i++) {
|
|
|
|
for (int j = 0; j < 200; j++) {
|
|
|
|
ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
env_->MockSleepForSeconds(60 * 60); // 1 hour
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(5);
|
|
|
|
|
|
|
|
// The LSM tree should be like:
|
|
|
|
// L5: [0, 199] [200, 399] [400,599] [600,799] [800, 999]
|
|
|
|
// L6: [0,99][100,199][200,299][300,399]....................[800,899][900,999]
|
|
|
|
ASSERT_EQ("0,0,0,0,0,5,10", FilesPerLevel());
|
|
|
|
|
|
|
|
// point the compaction cursor to the 4th file on L5
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
ASSERT_NE(cfd, nullptr);
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
ASSERT_NE(current, nullptr);
|
|
|
|
VersionStorageInfo* storage_info = current->storage_info();
|
|
|
|
ASSERT_NE(storage_info, nullptr);
|
|
|
|
const InternalKey split_cursor = InternalKey(Key(600), 100000, kTypeValue);
|
|
|
|
storage_info->AddCursorForOneLevel(5, split_cursor);
|
|
|
|
|
|
|
|
// make the first file on L5 expired, there should be 3 TTL compactions:
|
|
|
|
// 4th one, 5th one, then 1st one.
|
|
|
|
env_->MockSleepForSeconds(19 * 60 * 60 + 1);
|
|
|
|
// trigger TTL compaction
|
|
|
|
ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
|
|
|
|
ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(5));
|
|
|
|
|
|
|
|
ASSERT_EQ(3, round_robin_ttl_compactions);
|
|
|
|
ASSERT_EQ(0, ttl_compactions);
|
|
|
|
ASSERT_EQ(0, other_compactions);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
|
|
|
|
const int kNumKeysPerFile = 32;
|
|
|
|
const int kNumLevelFiles = 2;
|
|
|
|
const int kValueSize = 1024;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.ttl = 24 * 60 * 60; // 24 hours
|
|
|
|
options.max_open_files = -1;
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->SetMockSleep();
|
|
|
|
options.env = env_;
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
MoveFilesToLevel(3);
|
|
|
|
ASSERT_EQ("0,0,0,2", FilesPerLevel());
|
|
|
|
|
|
|
|
// Delete previously written keys.
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("2,0,0,2", FilesPerLevel());
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
ASSERT_EQ("0,2,0,2", FilesPerLevel());
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(36 * 60 * 60); // 36 hours
|
|
|
|
ASSERT_EQ("0,2,0,2", FilesPerLevel());
|
|
|
|
|
|
|
|
// Just do a simple write + flush so that the Ttl expired files get
|
|
|
|
// compacted.
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// All non-L0 files are deleted, as they contained only deleted data.
|
|
|
|
ASSERT_EQ("1", FilesPerLevel());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
// Test dynamically changing ttl.
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
MoveFilesToLevel(3);
|
|
|
|
ASSERT_EQ("0,0,0,2", FilesPerLevel());
|
|
|
|
|
|
|
|
// Delete previously written keys.
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("2,0,0,2", FilesPerLevel());
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
ASSERT_EQ("0,2,0,2", FilesPerLevel());
|
|
|
|
|
|
|
|
// Move time forward by 12 hours, and make sure that compaction still doesn't
|
|
|
|
// trigger as ttl is set to 24 hours.
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(12 * 60 * 60);
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("1,2,0,2", FilesPerLevel());
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Dynamically change ttl to 10 hours.
|
|
|
|
// This should trigger a ttl compaction, as 12 hours have already passed.
|
|
|
|
ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// All non-L0 files are deleted, as they contained only deleted data.
|
|
|
|
ASSERT_EQ("1", FilesPerLevel());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->SetMockSleep();
|
|
|
|
const int kValueSize = 100;
|
|
|
|
|
|
|
|
for (bool if_restart : {false, true}) {
|
|
|
|
for (bool if_open_all_files : {false, true}) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.ttl = 24 * 60 * 60; // 24 hours
|
|
|
|
if (if_open_all_files) {
|
|
|
|
options.max_open_files = -1;
|
|
|
|
} else {
|
|
|
|
options.max_open_files = 20;
|
|
|
|
}
|
|
|
|
// RocksDB sanitize max open files to at least 20. Modify it back.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = static_cast<int*>(arg);
|
|
|
|
*max_open_files = 2;
|
|
|
|
});
|
|
|
|
// In the case where all files are opened and doing DB restart
|
|
|
|
// forcing the oldest ancester time in manifest file to be 0 to
|
|
|
|
// simulate the case of reading from an old version.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
|
|
|
|
if (if_restart && if_open_all_files) {
|
|
|
|
std::string* encoded_fieled = static_cast<std::string*>(arg);
|
|
|
|
*encoded_fieled = "";
|
|
|
|
PutVarint64(encoded_fieled, 0);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
options.env = env_;
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int ttl_compactions = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
auto compaction_reason = compaction->compaction_reason();
|
|
|
|
if (compaction_reason == CompactionReason::kTtl) {
|
|
|
|
ttl_compactions++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 1; i <= 100; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
// Get the first file's creation time. This will be the oldest file in the
|
|
|
|
// DB. Compactions inolving this file's descendents should keep getting
|
|
|
|
// this time.
|
|
|
|
std::vector<std::vector<FileMetaData>> level_to_files;
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
|
|
|
|
// Add 1 hour and do another flush.
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(1 * 60 * 60);
|
|
|
|
for (int i = 101; i <= 200; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(6);
|
|
|
|
ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(1 * 60 * 60);
|
|
|
|
// Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
|
|
|
|
for (int i = 1; i <= 50; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(1 * 60 * 60);
|
|
|
|
for (int i = 51; i <= 150; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(4);
|
|
|
|
ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(1 * 60 * 60);
|
|
|
|
// Add one L1 file with key range: [26, 75].
|
|
|
|
for (int i = 26; i <= 75; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
|
|
|
|
|
|
|
|
// LSM tree:
|
|
|
|
// L1: [26 .. 75]
|
|
|
|
// L4: [1 .. 50][51 ..... 150]
|
|
|
|
// L6: [1 ........ 100][101 .... 200]
|
|
|
|
//
|
|
|
|
// On TTL expiry, TTL compaction should be initiated on L1 file, and the
|
|
|
|
// compactions should keep going on until the key range hits bottom level.
|
|
|
|
// In other words: the compaction on this data range "cascasdes" until
|
|
|
|
// reaching the bottom level.
|
|
|
|
//
|
|
|
|
// Order of events on TTL expiry:
|
|
|
|
// 1. L1 file falls to L3 via 2 trivial moves which are initiated by the
|
|
|
|
// ttl
|
|
|
|
// compaction.
|
|
|
|
// 2. A TTL compaction happens between L3 and L4 files. Output file in L4.
|
|
|
|
// 3. The new output file from L4 falls to L5 via 1 trival move initiated
|
|
|
|
// by the ttl compaction.
|
|
|
|
// 4. A TTL compaction happens between L5 and L6 files. Ouptut in L6.
|
|
|
|
|
|
|
|
// Add 25 hours and do a write
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(25 * 60 * 60);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(Key(1), "1"));
|
|
|
|
if (if_restart) {
|
|
|
|
Reopen(options);
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
|
|
|
|
ASSERT_EQ(5, ttl_compactions);
|
|
|
|
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(25 * 60 * 60);
|
|
|
|
ASSERT_OK(Put(Key(2), "1"));
|
|
|
|
if (if_restart) {
|
|
|
|
Reopen(options);
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
|
|
|
|
ASSERT_GE(ttl_compactions, 6);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->SetMockSleep();
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
const int kNumKeysPerFile = 32;
|
|
|
|
const int kNumLevelFiles = 2;
|
|
|
|
const int kValueSize = 100;
|
|
|
|
|
|
|
|
for (bool if_restart : {false, true}) {
|
|
|
|
for (bool if_open_all_files : {false, true}) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
|
|
|
|
if (if_open_all_files) {
|
|
|
|
options.max_open_files = -1; // needed for ttl compaction
|
|
|
|
} else {
|
|
|
|
options.max_open_files = 20;
|
|
|
|
}
|
|
|
|
// RocksDB sanitize max open files to at least 20. Modify it back.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = static_cast<int*>(arg);
|
|
|
|
*max_open_files = 0;
|
|
|
|
});
|
|
|
|
// In the case where all files are opened and doing DB restart
|
|
|
|
// forcing the file creation time in manifest file to be 0 to
|
|
|
|
// simulate the case of reading from an old version.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
|
|
|
|
if (if_restart && if_open_all_files) {
|
|
|
|
std::string* encoded_fieled = static_cast<std::string*>(arg);
|
|
|
|
*encoded_fieled = "";
|
|
|
|
PutVarint64(encoded_fieled, 0);
|
|
|
|
}
|
|
|
|
});
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
options.env = env_;
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
int periodic_compactions = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
auto compaction_reason = compaction->compaction_reason();
|
|
|
|
if (compaction_reason == CompactionReason::kPeriodicCompaction) {
|
|
|
|
periodic_compactions++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
ASSERT_EQ("2", FilesPerLevel());
|
|
|
|
ASSERT_EQ(0, periodic_compactions);
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
// Add 50 hours and do a write
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(50 * 60 * 60);
|
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// Assert that the files stay in the same level
|
|
|
|
ASSERT_EQ("3", FilesPerLevel());
|
|
|
|
// The two old files go through the periodic compaction process
|
|
|
|
ASSERT_EQ(2, periodic_compactions);
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
ASSERT_EQ("0,3", FilesPerLevel());
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
// Add another 50 hours and do another write
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(50 * 60 * 60);
|
|
|
|
ASSERT_OK(Put("b", "2"));
|
|
|
|
if (if_restart) {
|
|
|
|
Reopen(options);
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("1,3", FilesPerLevel());
|
|
|
|
// The three old files now go through the periodic compaction process. 2
|
|
|
|
// + 3.
|
|
|
|
ASSERT_EQ(5, periodic_compactions);
|
|
|
|
|
|
|
|
// Add another 50 hours and do another write
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(50 * 60 * 60);
|
|
|
|
ASSERT_OK(Put("c", "3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("2,3", FilesPerLevel());
|
|
|
|
// The four old files now go through the periodic compaction process. 5
|
|
|
|
// + 4.
|
|
|
|
ASSERT_EQ(9, periodic_compactions);
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
|
|
|
|
// This test makes sure that periodic compactions are working with a DB
|
|
|
|
// where file_creation_time of some files is 0.
|
|
|
|
// After compactions the new files are created with a valid file_creation_time
|
|
|
|
|
|
|
|
const int kNumKeysPerFile = 32;
|
|
|
|
const int kNumFiles = 4;
|
|
|
|
const int kValueSize = 100;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->SetMockSleep();
|
|
|
|
options.env = env_;
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int periodic_compactions = 0;
|
|
|
|
bool set_file_creation_time_to_zero = true;
|
|
|
|
bool set_creation_time_to_zero = true;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
auto compaction_reason = compaction->compaction_reason();
|
|
|
|
if (compaction_reason == CompactionReason::kPeriodicCompaction) {
|
|
|
|
periodic_compactions++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
|
|
|
|
TableProperties* props = reinterpret_cast<TableProperties*>(arg);
|
|
|
|
if (set_file_creation_time_to_zero) {
|
|
|
|
props->file_creation_time = 0;
|
|
|
|
}
|
|
|
|
if (set_creation_time_to_zero) {
|
|
|
|
props->creation_time = 0;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < kNumFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
// Move the first two files to L2.
|
|
|
|
if (i == 1) {
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
set_creation_time_to_zero = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_EQ("2,0,2", FilesPerLevel());
|
|
|
|
ASSERT_EQ(0, periodic_compactions);
|
|
|
|
|
|
|
|
Close();
|
|
|
|
|
|
|
|
set_file_creation_time_to_zero = false;
|
|
|
|
// Forward the clock by 2 days.
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(2 * 24 * 60 * 60);
|
|
|
|
options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ("2,0,2", FilesPerLevel());
|
|
|
|
// Make sure that all files go through periodic compaction.
|
|
|
|
ASSERT_EQ(kNumFiles, periodic_compactions);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) {
|
|
|
|
const int kNumKeysPerFile = 32;
|
|
|
|
const int kNumLevelFiles = 2;
|
|
|
|
const int kValueSize = 100;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.ttl = 10 * 60 * 60; // 10 hours
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
|
|
|
|
options.max_open_files = -1; // needed for both periodic and ttl compactions
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->SetMockSleep();
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
options.env = env_;
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
|
|
|
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
int periodic_compactions = 0;
|
|
|
|
int ttl_compactions = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
auto compaction_reason = compaction->compaction_reason();
|
|
|
|
if (compaction_reason == CompactionReason::kPeriodicCompaction) {
|
|
|
|
periodic_compactions++;
|
|
|
|
} else if (compaction_reason == CompactionReason::kTtl) {
|
|
|
|
ttl_compactions++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
MoveFilesToLevel(3);
|
|
|
|
|
|
|
|
ASSERT_EQ("0,0,0,2", FilesPerLevel());
|
|
|
|
ASSERT_EQ(0, periodic_compactions);
|
|
|
|
ASSERT_EQ(0, ttl_compactions);
|
|
|
|
|
|
|
|
// Add some time greater than periodic_compaction_time.
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(50 * 60 * 60);
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
// Files in the bottom level go through periodic compactions.
|
|
|
|
ASSERT_EQ("1,0,0,2", FilesPerLevel());
|
|
|
|
ASSERT_EQ(2, periodic_compactions);
|
|
|
|
ASSERT_EQ(0, ttl_compactions);
|
|
|
|
|
|
|
|
// Add a little more time than ttl
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(11 * 60 * 60);
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
ASSERT_OK(Put("b", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
// Notice that the previous file in level 1 falls down to the bottom level
|
|
|
|
// due to ttl compactions, one level at a time.
|
|
|
|
// And bottom level files don't get picked up for ttl compactions.
|
|
|
|
ASSERT_EQ("1,0,0,3", FilesPerLevel());
|
|
|
|
ASSERT_EQ(2, periodic_compactions);
|
|
|
|
ASSERT_EQ(3, ttl_compactions);
|
|
|
|
|
|
|
|
// Add some time greater than periodic_compaction_time.
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(50 * 60 * 60);
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
ASSERT_OK(Put("c", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
// Previous L0 file falls one level at a time to bottom level due to ttl.
|
|
|
|
// And all 4 bottom files go through periodic compactions.
|
|
|
|
ASSERT_EQ("1,0,0,4", FilesPerLevel());
|
|
|
|
ASSERT_EQ(6, periodic_compactions);
|
|
|
|
ASSERT_EQ(6, ttl_compactions);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
}
|
|
|
|
|
Try to start TTL earlier with kMinOverlappingRatio is used (#8749)
Summary:
Right now, when options.ttl is set, compactions are triggered around the time when TTL is reached. This might cause extra compactions which are often bursty. This commit tries to mitigate it by picking those files earlier in normal compaction picking process. This is only implemented using kMinOverlappingRatio with Leveled compaction as it is the default value and it is more complicated to change other styles.
When a file is aged more than ttl/2, RocksDB starts to boost the compaction priority of files in normal compaction picking process, and hope by the time TTL is reached, very few extra compaction is needed.
In order for this to work, another change is made: during a compaction, if an output level file is older than ttl/2, cut output files based on original boundary (if it is not in the last level). This is to make sure that after an old file is moved to the next level, and new data is merged from the upper level, the new data falling into this range isn't reset with old timestamp. Without this change, in many cases, most files from one level will keep having old timestamp, even if they have newer data and we stuck in it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8749
Test Plan: Add a unit test to test the boosting logic. Will add a unit test to test it end-to-end.
Reviewed By: jay-zhuang
Differential Revision: D30735261
fbshipit-source-id: 503c2d89250b22911eb99e72b379be154de3428e
3 years ago
|
|
|
TEST_F(DBCompactionTest, LevelTtlBooster) {
|
|
|
|
const int kNumKeysPerFile = 32;
|
|
|
|
const int kNumLevelFiles = 3;
|
|
|
|
const int kValueSize = 1000;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.ttl = 10 * 60 * 60; // 10 hours
|
|
|
|
options.periodic_compaction_seconds = 480 * 60 * 60; // very long
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize};
|
|
|
|
options.max_open_files = -1; // needed for both periodic and ttl compactions
|
|
|
|
options.compaction_pri = CompactionPri::kMinOverlappingRatio;
|
|
|
|
env_->SetMockSleep();
|
|
|
|
options.env = env_;
|
|
|
|
|
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
ASSERT_EQ("0,0,3", FilesPerLevel());
|
|
|
|
|
|
|
|
// Create some files for L1
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ("0,1,3", FilesPerLevel());
|
|
|
|
|
|
|
|
// Make the new L0 files qualify TTL boosting and generate one more to trigger
|
|
|
|
// L1 -> L2 compaction. Old files will be picked even if their priority is
|
|
|
|
// lower without boosting.
|
|
|
|
env_->MockSleepForSeconds(8 * 60 * 60);
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i),
|
|
|
|
rnd.RandomString(kValueSize * 2)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
}
|
|
|
|
// Force files to be compacted to L1
|
|
|
|
ASSERT_OK(
|
|
|
|
dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "1"}}));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Try to start TTL earlier with kMinOverlappingRatio is used (#8749)
Summary:
Right now, when options.ttl is set, compactions are triggered around the time when TTL is reached. This might cause extra compactions which are often bursty. This commit tries to mitigate it by picking those files earlier in normal compaction picking process. This is only implemented using kMinOverlappingRatio with Leveled compaction as it is the default value and it is more complicated to change other styles.
When a file is aged more than ttl/2, RocksDB starts to boost the compaction priority of files in normal compaction picking process, and hope by the time TTL is reached, very few extra compaction is needed.
In order for this to work, another change is made: during a compaction, if an output level file is older than ttl/2, cut output files based on original boundary (if it is not in the last level). This is to make sure that after an old file is moved to the next level, and new data is merged from the upper level, the new data falling into this range isn't reset with old timestamp. Without this change, in many cases, most files from one level will keep having old timestamp, even if they have newer data and we stuck in it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8749
Test Plan: Add a unit test to test the boosting logic. Will add a unit test to test it end-to-end.
Reviewed By: jay-zhuang
Differential Revision: D30735261
fbshipit-source-id: 503c2d89250b22911eb99e72b379be154de3428e
3 years ago
|
|
|
ASSERT_EQ("0,1,2", FilesPerLevel());
|
|
|
|
ASSERT_OK(
|
|
|
|
dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
|
Try to start TTL earlier with kMinOverlappingRatio is used (#8749)
Summary:
Right now, when options.ttl is set, compactions are triggered around the time when TTL is reached. This might cause extra compactions which are often bursty. This commit tries to mitigate it by picking those files earlier in normal compaction picking process. This is only implemented using kMinOverlappingRatio with Leveled compaction as it is the default value and it is more complicated to change other styles.
When a file is aged more than ttl/2, RocksDB starts to boost the compaction priority of files in normal compaction picking process, and hope by the time TTL is reached, very few extra compaction is needed.
In order for this to work, another change is made: during a compaction, if an output level file is older than ttl/2, cut output files based on original boundary (if it is not in the last level). This is to make sure that after an old file is moved to the next level, and new data is merged from the upper level, the new data falling into this range isn't reset with old timestamp. Without this change, in many cases, most files from one level will keep having old timestamp, even if they have newer data and we stuck in it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8749
Test Plan: Add a unit test to test the boosting logic. Will add a unit test to test it end-to-end.
Reviewed By: jay-zhuang
Differential Revision: D30735261
fbshipit-source-id: 503c2d89250b22911eb99e72b379be154de3428e
3 years ago
|
|
|
|
|
|
|
ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize);
|
|
|
|
}
|
|
|
|
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) {
|
|
|
|
class TestCompactionFilter : public CompactionFilter {
|
|
|
|
const char* Name() const override { return "TestCompactionFilter"; }
|
|
|
|
};
|
|
|
|
class TestCompactionFilterFactory : public CompactionFilterFactory {
|
|
|
|
const char* Name() const override { return "TestCompactionFilterFactory"; }
|
|
|
|
std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
|
|
|
const CompactionFilter::Context& /*context*/) override {
|
|
|
|
return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
const int kNumKeysPerFile = 32;
|
|
|
|
const int kNumLevelFiles = 2;
|
|
|
|
const int kValueSize = 100;
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
TestCompactionFilter test_compaction_filter;
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->SetMockSleep();
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
options.env = env_;
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
|
|
|
|
// NOTE: Presumed unnecessary and removed: resetting mock time in env
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
|
|
|
|
enum CompactionFilterType {
|
|
|
|
kUseCompactionFilter,
|
|
|
|
kUseCompactionFilterFactory
|
|
|
|
};
|
|
|
|
|
|
|
|
for (CompactionFilterType comp_filter_type :
|
|
|
|
{kUseCompactionFilter, kUseCompactionFilterFactory}) {
|
|
|
|
// Assert that periodic compactions are not enabled.
|
|
|
|
ASSERT_EQ(std::numeric_limits<uint64_t>::max() - 1,
|
|
|
|
options.periodic_compaction_seconds);
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
|
|
|
|
if (comp_filter_type == kUseCompactionFilter) {
|
|
|
|
options.compaction_filter = &test_compaction_filter;
|
|
|
|
options.compaction_filter_factory.reset();
|
|
|
|
} else if (comp_filter_type == kUseCompactionFilterFactory) {
|
|
|
|
options.compaction_filter = nullptr;
|
|
|
|
options.compaction_filter_factory.reset(
|
|
|
|
new TestCompactionFilterFactory());
|
|
|
|
}
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// periodic_compaction_seconds should be set to the sanitized value when
|
|
|
|
// a compaction filter or a compaction filter factory is used.
|
|
|
|
ASSERT_EQ(30 * 24 * 60 * 60,
|
|
|
|
dbfull()->GetOptions().periodic_compaction_seconds);
|
|
|
|
|
|
|
|
int periodic_compactions = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
auto compaction_reason = compaction->compaction_reason();
|
|
|
|
if (compaction_reason == CompactionReason::kPeriodicCompaction) {
|
|
|
|
periodic_compactions++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
|
|
|
|
for (int i = 0; i < kNumLevelFiles; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
|
|
|
|
ASSERT_EQ("2", FilesPerLevel());
|
|
|
|
ASSERT_EQ(0, periodic_compactions);
|
|
|
|
|
|
|
|
// Add 31 days and do a write
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
4 years ago
|
|
|
env_->MockSleepForSeconds(31 * 24 * 60 * 60);
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
ASSERT_OK(Put("a", "1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
// Assert that the files stay in the same level
|
|
|
|
ASSERT_EQ("3", FilesPerLevel());
|
|
|
|
// The two old files go through the periodic compaction process
|
|
|
|
ASSERT_EQ(2, periodic_compactions);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
}
|
|
|
|
}
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
6 years ago
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
|
|
|
|
// Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
|
|
|
|
// compaction only triggers flush after it's sure stall won't be triggered for
|
|
|
|
// L0 file count going too high.
|
|
|
|
const int kNumL0FilesTrigger = 4;
|
|
|
|
const int kNumL0FilesLimit = 8;
|
|
|
|
// i == 0: verifies normal case where stall is avoided by delay
|
|
|
|
// i == 1: verifies no delay in edge case where stall trigger is same as
|
|
|
|
// compaction trigger, so stall can't be avoided
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
|
|
|
|
if (i == 0) {
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
|
|
|
|
} else {
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
|
|
|
|
}
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
if (i == 0) {
|
|
|
|
// ensure the auto compaction doesn't finish until manual compaction has
|
|
|
|
// had a chance to be delayed.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
|
|
|
|
"CompactionJob::Run():End"}});
|
|
|
|
} else {
|
|
|
|
// ensure the auto-compaction doesn't finish until manual compaction has
|
|
|
|
// continued without delay.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:StallWaitDone",
|
|
|
|
"CompactionJob::Run():End"}});
|
|
|
|
}
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
|
|
|
|
for (int k = 0; k < 2; ++k) {
|
|
|
|
ASSERT_OK(Put(Key(k), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
auto manual_compaction_thread = port::Thread([this]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.allow_write_stall = false;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
});
|
|
|
|
|
|
|
|
manual_compaction_thread.join();
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(0, NumTableFilesAtLevel(0));
|
|
|
|
ASSERT_GT(NumTableFilesAtLevel(1), 0);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) {
|
|
|
|
// Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
|
|
|
|
// compaction only triggers flush after it's sure stall won't be triggered for
|
|
|
|
// immutable memtable count going too high.
|
|
|
|
const int kNumImmMemTableLimit = 8;
|
|
|
|
// i == 0: verifies normal case where stall is avoided by delay
|
|
|
|
// i == 1: verifies no delay in edge case where stall trigger is same as flush
|
|
|
|
// trigger, so stall can't be avoided
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
// the delay limit is one less than the stop limit. This test focuses on
|
|
|
|
// avoiding delay limit, but this option sets stop limit, so add one.
|
|
|
|
options.max_write_buffer_number = kNumImmMemTableLimit + 1;
|
|
|
|
if (i == 1) {
|
|
|
|
options.min_write_buffer_number_to_merge = kNumImmMemTableLimit;
|
|
|
|
}
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
if (i == 0) {
|
|
|
|
// ensure the flush doesn't finish until manual compaction has had a
|
|
|
|
// chance to be delayed.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
|
|
|
|
"FlushJob::WriteLevel0Table"}});
|
|
|
|
} else {
|
|
|
|
// ensure the flush doesn't finish until manual compaction has continued
|
|
|
|
// without delay.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:StallWaitDone",
|
|
|
|
"FlushJob::WriteLevel0Table"}});
|
|
|
|
}
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
|
|
|
|
FlushOptions flush_opts;
|
|
|
|
flush_opts.wait = false;
|
|
|
|
flush_opts.allow_write_stall = true;
|
|
|
|
ASSERT_OK(dbfull()->Flush(flush_opts));
|
|
|
|
}
|
|
|
|
|
|
|
|
auto manual_compaction_thread = port::Thread([this]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.allow_write_stall = false;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
});
|
|
|
|
|
|
|
|
manual_compaction_thread.join();
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_EQ(0, NumTableFilesAtLevel(0));
|
|
|
|
ASSERT_GT(NumTableFilesAtLevel(1), 0);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) {
|
|
|
|
// Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay
|
|
|
|
// does not hang if CF is dropped or DB is closed
|
|
|
|
const int kNumL0FilesTrigger = 4;
|
|
|
|
const int kNumL0FilesLimit = 8;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
|
|
|
|
options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
|
|
|
|
// i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it
|
|
|
|
// i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to
|
|
|
|
// simulate what happens during Close as we can't call Close (it
|
|
|
|
// blocks on the auto-compaction, making a cycle).
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
CreateAndReopenWithCF({"one"}, options);
|
|
|
|
// The calls to close CF/DB wait until the manual compaction stalls.
|
|
|
|
// The auto-compaction waits until the manual compaction finishes to ensure
|
|
|
|
// the signal comes from closing CF/DB, not from compaction making progress.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
|
|
|
|
"DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
|
|
|
|
{"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
|
|
|
|
"CompactionJob::Run():End"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
|
|
|
|
for (int k = 0; k < 2; ++k) {
|
|
|
|
ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
}
|
|
|
|
auto manual_compaction_thread = port::Thread([this, i]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.allow_write_stall = false;
|
|
|
|
if (i == 0) {
|
|
|
|
ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
|
|
|
|
.IsColumnFamilyDropped());
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
|
|
|
|
.IsShutdownInProgress());
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
|
|
|
|
if (i == 0) {
|
|
|
|
ASSERT_OK(db_->DropColumnFamily(handles_[1]));
|
|
|
|
} else {
|
|
|
|
dbfull()->CancelAllBackgroundWork(false /* wait */);
|
|
|
|
}
|
|
|
|
manual_compaction_thread.join();
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
|
|
|
|
// Verify that, when `CompactRangeOptions::allow_write_stall == false`,
|
|
|
|
// CompactRange skips its flush if the delay is long enough that the memtables
|
|
|
|
// existing at the beginning of the call have already been flushed.
|
|
|
|
const int kNumL0FilesTrigger = 4;
|
|
|
|
const int kNumL0FilesLimit = 8;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
// The manual flush includes the memtable that was active when CompactRange
|
|
|
|
// began. So it unblocks CompactRange and precludes its flush. Throughout the
|
|
|
|
// test, stall conditions are upheld via high L0 file count.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
|
|
|
|
"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
|
|
|
|
{"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
|
|
|
|
"DBImpl::FlushMemTable:StallWaitDone"},
|
|
|
|
{"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// used for the delayable flushes
|
|
|
|
FlushOptions flush_opts;
|
|
|
|
flush_opts.allow_write_stall = true;
|
|
|
|
for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
|
|
|
|
for (int j = 0; j < 2; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->Flush(flush_opts));
|
|
|
|
}
|
|
|
|
auto manual_compaction_thread = port::Thread([this]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.allow_write_stall = false;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
|
|
|
|
ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
|
|
|
|
ASSERT_OK(dbfull()->Flush(flush_opts));
|
|
|
|
ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
|
|
|
|
manual_compaction_thread.join();
|
|
|
|
|
|
|
|
// If CompactRange's flush was skipped, the final Put above will still be
|
|
|
|
// in the active memtable.
|
|
|
|
std::string num_keys_in_memtable;
|
|
|
|
ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable,
|
|
|
|
&num_keys_in_memtable));
|
|
|
|
ASSERT_EQ(std::to_string(1), num_keys_in_memtable);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) {
|
|
|
|
// Verify memtable only gets flushed if it contains data overlapping the range
|
|
|
|
// provided to `CompactRange`. Tests all kinds of overlap/non-overlap.
|
|
|
|
const int kNumEndpointKeys = 5;
|
|
|
|
std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"};
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// One extra iteration for nullptr, which means left side of interval is
|
|
|
|
// unbounded.
|
|
|
|
for (int i = 0; i <= kNumEndpointKeys; ++i) {
|
|
|
|
Slice begin;
|
|
|
|
Slice* begin_ptr;
|
|
|
|
if (i == 0) {
|
|
|
|
begin_ptr = nullptr;
|
|
|
|
} else {
|
|
|
|
begin = keys[i - 1];
|
|
|
|
begin_ptr = &begin;
|
|
|
|
}
|
|
|
|
// Start at `i` so right endpoint comes after left endpoint. One extra
|
|
|
|
// iteration for nullptr, which means right side of interval is unbounded.
|
|
|
|
for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) {
|
|
|
|
Slice end;
|
|
|
|
Slice* end_ptr;
|
|
|
|
if (j == kNumEndpointKeys) {
|
|
|
|
end_ptr = nullptr;
|
|
|
|
} else {
|
|
|
|
end = keys[j];
|
|
|
|
end_ptr = &end;
|
|
|
|
}
|
|
|
|
ASSERT_OK(Put("b", "val"));
|
|
|
|
ASSERT_OK(Put("d", "val"));
|
|
|
|
CompactRangeOptions compact_range_opts;
|
|
|
|
ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr));
|
|
|
|
|
|
|
|
uint64_t get_prop_tmp, num_memtable_entries = 0;
|
|
|
|
ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables,
|
|
|
|
&get_prop_tmp));
|
|
|
|
num_memtable_entries += get_prop_tmp;
|
|
|
|
ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
|
|
|
|
&get_prop_tmp));
|
|
|
|
num_memtable_entries += get_prop_tmp;
|
|
|
|
if (begin_ptr == nullptr || end_ptr == nullptr ||
|
|
|
|
(i <= 4 && j >= 1 && (begin != "c" || end != "c"))) {
|
|
|
|
// In this case `CompactRange`'s range overlapped in some way with the
|
|
|
|
// memtable's range, so flush should've happened. Then "b" and "d" won't
|
|
|
|
// be in the memtable.
|
|
|
|
ASSERT_EQ(0, num_memtable_entries);
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(2, num_memtable_entries);
|
|
|
|
// flush anyways to prepare for next iteration
|
|
|
|
ASSERT_OK(db_->Flush(FlushOptions()));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionStatsTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
CompactionStatsCollector* collector = new CompactionStatsCollector();
|
|
|
|
options.listeners.emplace_back(collector);
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < 32; i++) {
|
|
|
|
for (int j = 0; j < 5000; j++) {
|
|
|
|
ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ColumnFamilyHandleImpl* cfh =
|
|
|
|
static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
|
|
|
|
ColumnFamilyData* cfd = cfh->cfd();
|
|
|
|
|
|
|
|
VerifyCompactionStats(*cfd, *collector);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, SubcompactionEvent) {
|
|
|
|
class SubCompactionEventListener : public EventListener {
|
|
|
|
public:
|
|
|
|
void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
ASSERT_EQ(running_compactions_.find(ci.job_id),
|
|
|
|
running_compactions_.end());
|
|
|
|
running_compactions_.emplace(ci.job_id, std::unordered_set<int>());
|
|
|
|
}
|
|
|
|
|
|
|
|
void OnCompactionCompleted(DB* /*db*/,
|
|
|
|
const CompactionJobInfo& ci) override {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
auto it = running_compactions_.find(ci.job_id);
|
|
|
|
ASSERT_NE(it, running_compactions_.end());
|
|
|
|
ASSERT_EQ(it->second.size(), 0);
|
|
|
|
running_compactions_.erase(it);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OnSubcompactionBegin(const SubcompactionJobInfo& si) override {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
auto it = running_compactions_.find(si.job_id);
|
|
|
|
ASSERT_NE(it, running_compactions_.end());
|
|
|
|
auto r = it->second.insert(si.subcompaction_job_id);
|
|
|
|
ASSERT_TRUE(r.second); // each subcompaction_job_id should be different
|
|
|
|
total_subcompaction_cnt_++;
|
|
|
|
}
|
|
|
|
|
|
|
|
void OnSubcompactionCompleted(const SubcompactionJobInfo& si) override {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
auto it = running_compactions_.find(si.job_id);
|
|
|
|
ASSERT_NE(it, running_compactions_.end());
|
|
|
|
auto r = it->second.erase(si.subcompaction_job_id);
|
|
|
|
ASSERT_EQ(r, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t GetRunningCompactionCount() {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
return running_compactions_.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t GetTotalSubcompactionCount() {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
return total_subcompaction_cnt_;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
InstrumentedMutex mutex_;
|
|
|
|
std::unordered_map<int, std::unordered_set<int>> running_compactions_;
|
|
|
|
size_t total_subcompaction_cnt_ = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.target_file_size_base = 1024;
|
|
|
|
options.level0_file_num_compaction_trigger = 10;
|
|
|
|
auto* listener = new SubCompactionEventListener();
|
|
|
|
options.listeners.emplace_back(listener);
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// generate 4 files @ L2
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
|
|
for (int j = 0; j < 10; j++) {
|
|
|
|
int key_id = i * 10 + j;
|
|
|
|
ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
// generate 2 files @ L1 which overlaps with L2 files
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
for (int j = 0; j < 10; j++) {
|
|
|
|
int key_id = i * 20 + j * 2;
|
|
|
|
ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
ASSERT_EQ(FilesPerLevel(), "0,2,4");
|
|
|
|
|
|
|
|
CompactRangeOptions comp_opts;
|
|
|
|
comp_opts.max_subcompactions = 4;
|
|
|
|
Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// make sure there's no running compaction
|
|
|
|
ASSERT_EQ(listener->GetRunningCompactionCount(), 0);
|
|
|
|
// and sub compaction is triggered
|
|
|
|
ASSERT_GT(listener->GetTotalSubcompactionCount(), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
|
|
|
|
// LSM setup:
|
|
|
|
// L1: [ba bz]
|
|
|
|
// L2: [a b] [c d]
|
|
|
|
// L3: [a b] [c d]
|
|
|
|
//
|
|
|
|
// Thread 1: Thread 2:
|
|
|
|
// Begin compacting all L2->L3
|
|
|
|
// Compact [ba bz] L1->L3
|
|
|
|
// End compacting all L2->L3
|
|
|
|
//
|
|
|
|
// The compaction operation in thread 2 should be disallowed because the range
|
|
|
|
// overlaps with the compaction in thread 1, which also covers that range in
|
|
|
|
// L3.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
FlushedFileCollector* collector = new FlushedFileCollector();
|
|
|
|
options.listeners.emplace_back(collector);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
for (int level = 3; level >= 2; --level) {
|
|
|
|
ASSERT_OK(Put("a", "val"));
|
|
|
|
ASSERT_OK(Put("b", "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("c", "val"));
|
|
|
|
ASSERT_OK(Put("d", "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(level);
|
|
|
|
}
|
|
|
|
ASSERT_OK(Put("ba", "val"));
|
|
|
|
ASSERT_OK(Put("bz", "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"CompactFilesImpl:0",
|
|
|
|
"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
|
|
|
|
{"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
|
|
|
|
"CompactFilesImpl:1"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
auto bg_thread = port::Thread([&]() {
|
|
|
|
// Thread 1
|
|
|
|
std::vector<std::string> filenames = collector->GetFlushedFiles();
|
|
|
|
filenames.pop_back();
|
|
|
|
ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
|
|
|
|
3 /* output_level */));
|
|
|
|
});
|
|
|
|
|
|
|
|
// Thread 2
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
|
|
|
|
std::string filename = collector->GetFlushedFiles().back();
|
|
|
|
ASSERT_FALSE(
|
|
|
|
db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
|
|
|
|
.ok());
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
|
|
|
|
|
|
|
|
bg_thread.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
SstStatsCollector* collector = new SstStatsCollector();
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.listeners.emplace_back(collector);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Make sure the L0 files overlap to prevent trivial move.
|
|
|
|
ASSERT_OK(Put("a", "val"));
|
|
|
|
ASSERT_OK(Put("b", "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Delete("a"));
|
|
|
|
ASSERT_OK(Delete("b"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
|
|
|
|
|
|
|
|
// Expect one file creation to start for each flush, and zero for compaction
|
|
|
|
// since no keys are written.
|
|
|
|
ASSERT_EQ(2, collector->num_ssts_creation_started());
|
|
|
|
}
|
|
|
|
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
TEST_F(DBCompactionTest, CompactionLimiter) {
|
|
|
|
const int kNumKeysPerFile = 10;
|
|
|
|
const int kMaxBackgroundThreads = 64;
|
|
|
|
|
|
|
|
struct CompactionLimiter {
|
|
|
|
std::string name;
|
|
|
|
int limit_tasks;
|
|
|
|
int max_tasks;
|
|
|
|
int tasks;
|
|
|
|
std::shared_ptr<ConcurrentTaskLimiter> limiter;
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<CompactionLimiter> limiter_settings;
|
|
|
|
limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
|
|
|
|
limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
|
|
|
|
limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
|
|
|
|
|
|
|
|
for (auto& ls : limiter_settings) {
|
|
|
|
ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
|
|
|
|
NewConcurrentTaskLimiter("unique_limiter", -1));
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
|
|
|
|
const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7",
|
|
|
|
"8", "9", "a", "b", "c", "d", "e", "f"};
|
|
|
|
const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
|
|
|
|
std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 110 * 1024; // 110KB
|
|
|
|
options.arena_block_size = 4096;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.level0_file_num_compaction_trigger = 4;
|
|
|
|
options.level0_slowdown_writes_trigger = 64;
|
|
|
|
options.level0_stop_writes_trigger = 64;
|
|
|
|
options.max_background_jobs = kMaxBackgroundThreads; // Enough threads
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
|
|
|
options.max_write_buffer_number = 10; // Enough memtables
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::vector<Options> option_vector;
|
|
|
|
option_vector.reserve(cf_count);
|
|
|
|
|
|
|
|
for (unsigned int cf = 0; cf < cf_count; cf++) {
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
ColumnFamilyOptions cf_opt(options);
|
|
|
|
if (cf == 0) {
|
|
|
|
// "Default" CF does't use compaction limiter
|
|
|
|
cf_opt.compaction_thread_limiter = nullptr;
|
|
|
|
} else if (cf == 1) {
|
|
|
|
// "1" CF uses bypass compaction limiter
|
|
|
|
unique_limiter->SetMaxOutstandingTask(-1);
|
|
|
|
cf_opt.compaction_thread_limiter = unique_limiter;
|
|
|
|
} else {
|
|
|
|
// Assign limiter by mod
|
|
|
|
auto& ls = limiter_settings[cf % 3];
|
|
|
|
cf_opt.compaction_thread_limiter = ls.limiter;
|
|
|
|
cf_to_limiter[cf_names[cf]] = &ls;
|
|
|
|
}
|
|
|
|
option_vector.emplace_back(DBOptions(options), cf_opt);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned int cf = 1; cf < cf_count; cf++) {
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
|
|
|
|
}
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies(
|
|
|
|
std::vector<std::string>(cf_names, cf_names + cf_count), option_vector);
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
|
|
|
|
port::Mutex mutex;
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
|
|
|
|
const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
|
|
|
|
auto iter = cf_to_limiter.find(cf_name);
|
|
|
|
if (iter != cf_to_limiter.end()) {
|
|
|
|
MutexLock l(&mutex);
|
|
|
|
ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
|
|
|
|
iter->second->max_tasks =
|
|
|
|
std::max(iter->second->max_tasks, iter->second->limit_tasks);
|
|
|
|
}
|
|
|
|
});
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
|
|
|
|
const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
|
|
|
|
auto iter = cf_to_limiter.find(cf_name);
|
|
|
|
if (iter != cf_to_limiter.end()) {
|
|
|
|
MutexLock l(&mutex);
|
|
|
|
ASSERT_GE(--iter->second->tasks, 0);
|
|
|
|
}
|
|
|
|
});
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
|
|
|
|
// Block all compact threads in thread pool.
|
|
|
|
const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4;
|
|
|
|
const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks;
|
|
|
|
env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH);
|
|
|
|
env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW);
|
|
|
|
|
|
|
|
test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks];
|
|
|
|
|
|
|
|
// Block all compaction threads in thread pool.
|
|
|
|
for (size_t i = 0; i < kTotalCompactTasks; i++) {
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
|
|
|
&sleeping_compact_tasks[i], Env::LOW);
|
|
|
|
sleeping_compact_tasks[i].WaitUntilSleeping();
|
|
|
|
}
|
|
|
|
|
|
|
|
int keyIndex = 0;
|
|
|
|
|
|
|
|
for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) {
|
|
|
|
for (unsigned int cf = 0; cf < cf_count; cf++) {
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
ASSERT_OK(Put(cf, Key(keyIndex++), ""));
|
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(cf, "", ""));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned int cf = 0; cf < cf_count; cf++) {
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Enough L0 files to trigger compaction
|
|
|
|
for (unsigned int cf = 0; cf < cf_count; cf++) {
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0, cf),
|
|
|
|
options.level0_file_num_compaction_trigger);
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// Create more files for one column family, which triggers speed up
|
|
|
|
// condition, all compactions will be scheduled.
|
|
|
|
for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
|
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
ASSERT_OK(Put(0, Key(i), ""));
|
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(0, "", ""));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
|
|
|
|
NumTableFilesAtLevel(0, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
// All CFs are pending compaction
|
|
|
|
ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW));
|
|
|
|
|
|
|
|
// Unblock all compaction threads
|
|
|
|
for (size_t i = 0; i < kTotalCompactTasks; i++) {
|
|
|
|
sleeping_compact_tasks[i].WakeUp();
|
|
|
|
sleeping_compact_tasks[i].WaitUntilDone();
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned int cf = 0; cf < cf_count; cf++) {
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
|
|
|
|
// Max outstanding compact tasks reached limit
|
|
|
|
for (auto& ls : limiter_settings) {
|
|
|
|
ASSERT_EQ(ls.limit_tasks, ls.max_tasks);
|
|
|
|
ASSERT_EQ(0, ls.limiter->GetOutstandingTask());
|
|
|
|
}
|
|
|
|
|
|
|
|
// test manual compaction under a fully throttled limiter
|
|
|
|
int cf_test = 1;
|
|
|
|
unique_limiter->SetMaxOutstandingTask(0);
|
|
|
|
|
|
|
|
// flush one more file to cf 1
|
|
|
|
for (int i = 0; i < kNumKeysPerFile; i++) {
|
|
|
|
ASSERT_OK(Put(cf_test, Key(keyIndex++), ""));
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
}
|
|
|
|
// put extra key to trigger flush
|
|
|
|
ASSERT_OK(Put(cf_test, "", ""));
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]));
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test));
|
|
|
|
|
|
|
|
Compact(cf_test, Key(0), Key(keyIndex));
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
6 years ago
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
|
|
|
|
::testing::Values(std::make_tuple(1, true),
|
|
|
|
std::make_tuple(1, false),
|
|
|
|
std::make_tuple(4, true),
|
|
|
|
std::make_tuple(4, false)));
|
|
|
|
|
|
|
|
TEST_P(DBCompactionDirectIOTest, DirectIO) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
Destroy(options);
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.use_direct_io_for_flush_and_compaction = GetParam();
|
|
|
|
options.env = MockEnv::Create(Env::Default());
|
|
|
|
Reopen(options);
|
|
|
|
bool readahead = false;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
|
|
|
|
bool* use_direct_writes = static_cast<bool*>(arg);
|
|
|
|
ASSERT_EQ(*use_direct_writes,
|
|
|
|
options.use_direct_io_for_flush_and_compaction);
|
|
|
|
});
|
|
|
|
if (options.use_direct_io_for_flush_and_compaction) {
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; });
|
|
|
|
}
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
MakeTables(3, "p", "q", 1);
|
|
|
|
ASSERT_EQ("1,1,1", FilesPerLevel(1));
|
|
|
|
Compact(1, "p", "q");
|
|
|
|
ASSERT_EQ(readahead, options.use_direct_reads);
|
|
|
|
ASSERT_EQ("0,0,1", FilesPerLevel(1));
|
|
|
|
Destroy(options);
|
|
|
|
delete options.env;
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
|
|
|
|
testing::Bool());
|
|
|
|
|
|
|
|
class CompactionPriTest : public DBTestBase,
|
|
|
|
public testing::WithParamInterface<uint32_t> {
|
|
|
|
public:
|
|
|
|
CompactionPriTest()
|
|
|
|
: DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) {
|
|
|
|
compaction_pri_ = GetParam();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Required if inheriting from testing::WithParamInterface<>
|
|
|
|
static void SetUpTestCase() {}
|
|
|
|
static void TearDownTestCase() {}
|
|
|
|
|
|
|
|
uint32_t compaction_pri_;
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(CompactionPriTest, Test) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 16 * 1024;
|
|
|
|
options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
|
|
|
|
options.hard_pending_compaction_bytes_limit = 256 * 1024;
|
|
|
|
options.max_bytes_for_level_base = 64 * 1024;
|
|
|
|
options.max_bytes_for_level_multiplier = 4;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
const int kNKeys = 5000;
|
|
|
|
int keys[kNKeys];
|
|
|
|
for (int i = 0; i < kNKeys; i++) {
|
|
|
|
keys[i] = i;
|
|
|
|
}
|
|
|
|
RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
|
|
|
|
|
|
|
|
for (int i = 0; i < kNKeys; i++) {
|
|
|
|
ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102)));
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
for (int i = 0; i < kNKeys; i++) {
|
|
|
|
ASSERT_NE("NOT_FOUND", Get(Key(i)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
CompactionPriTest, CompactionPriTest,
|
|
|
|
::testing::Values(CompactionPri::kByCompensatedSize,
|
|
|
|
CompactionPri::kOldestLargestSeqFirst,
|
|
|
|
CompactionPri::kOldestSmallestSeqFirst,
|
|
|
|
CompactionPri::kMinOverlappingRatio,
|
|
|
|
CompactionPri::kRoundRobin));
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 16 * 1024;
|
|
|
|
options.max_bytes_for_level_base = 128 * 1024;
|
|
|
|
options.target_file_size_base = 64 * 1024;
|
|
|
|
options.level0_file_num_compaction_trigger = 4;
|
|
|
|
options.compaction_pri = CompactionPri::kRoundRobin;
|
|
|
|
options.max_bytes_for_level_multiplier = 4;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
// 30 Files in L0 to trigger compactions between L1 and L2
|
|
|
|
for (int i = 0; i < 30; i++) {
|
|
|
|
for (int j = 0; j < 16; j++) {
|
|
|
|
ASSERT_OK(Put(rnd.RandomString(24), rnd.RandomString(1000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
ASSERT_NE(cfd, nullptr);
|
|
|
|
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
ASSERT_NE(current, nullptr);
|
|
|
|
|
|
|
|
const VersionStorageInfo* const storage_info = current->storage_info();
|
|
|
|
ASSERT_NE(storage_info, nullptr);
|
|
|
|
|
|
|
|
const std::vector<InternalKey> compact_cursors =
|
|
|
|
storage_info->GetCompactCursors();
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
VersionSet* const reopened_versions = dbfull()->GetVersionSet();
|
|
|
|
assert(reopened_versions);
|
|
|
|
|
|
|
|
ColumnFamilyData* const reopened_cfd =
|
|
|
|
reopened_versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
ASSERT_NE(reopened_cfd, nullptr);
|
|
|
|
|
|
|
|
Version* const reopened_current = reopened_cfd->current();
|
|
|
|
ASSERT_NE(reopened_current, nullptr);
|
|
|
|
|
|
|
|
const VersionStorageInfo* const reopened_storage_info =
|
|
|
|
reopened_current->storage_info();
|
|
|
|
ASSERT_NE(reopened_storage_info, nullptr);
|
|
|
|
|
|
|
|
const std::vector<InternalKey> reopened_compact_cursors =
|
|
|
|
reopened_storage_info->GetCompactCursors();
|
|
|
|
const auto icmp = reopened_storage_info->InternalComparator();
|
|
|
|
ASSERT_EQ(compact_cursors.size(), reopened_compact_cursors.size());
|
|
|
|
for (size_t i = 0; i < compact_cursors.size(); i++) {
|
|
|
|
if (compact_cursors[i].Valid()) {
|
|
|
|
ASSERT_EQ(0,
|
|
|
|
icmp->Compare(compact_cursors[i], reopened_compact_cursors[i]));
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(!reopened_compact_cursors[i].Valid());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Support subcmpct using reserved resources for round-robin priority (#10341)
Summary:
Earlier implementation of round-robin priority can only pick one file at a time and disallows parallel compactions within the same level. In this PR, round-robin compaction policy will expand towards more input files with respecting some additional constraints, which are summarized as follows:
* Constraint 1: We can only pick consecutive files
- Constraint 1a: When a file is being compacted (or some input files are being compacted after expanding), we cannot choose it and have to stop choosing more files
- Constraint 1b: When we reach the last file (with the largest keys), we cannot choose more files (the next file will be the first one with small keys)
* Constraint 2: We should ensure the total compaction bytes (including the overlapped files from the next level) is no more than `mutable_cf_options_.max_compaction_bytes`
* Constraint 3: We try our best to pick as many files as possible so that the post-compaction level size can be just less than `MaxBytesForLevel(start_level_)`
* Constraint 4: If trivial move is allowed, we reuse the logic of `TryNonL0TrivialMove()` instead of expanding files with Constraint 3
More details can be found in `LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion()`.
The above optimization accelerates the process of moving the compaction cursor, in which the write-amp can be further reduced. While a large compaction may lead to high write stall, we break this large compaction into several subcompactions **regardless of** the `max_subcompactions` limit. The number of subcompactions for round-robin compaction priority is determined through the following steps:
* Step 1: Initialized against `max_output_file_limit`, the number of input files in the start level, and also the range size limit `ranges.size()`
* Step 2: Call `AcquireSubcompactionResources()`when max subcompactions is not sufficient, but we may or may not obtain desired resources, additional number of resources is stored in `extra_num_subcompaction_threads_reserved_`). Subcompaction limit is changed and update `num_planned_subcompactions` with `GetSubcompactionLimit()`
* Step 3: Call `ShrinkSubcompactionResources()` to ensure extra resources can be released (extra resources may exist for round-robin compaction when the number of actual number of subcompactions is less than the number of planned subcompactions)
More details can be found in `CompactionJob::AcquireSubcompactionResources()`,`CompactionJob::ShrinkSubcompactionResources()`, and `CompactionJob::ReleaseSubcompactionResources()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10341
Test Plan: Add `CompactionPriMultipleFilesRoundRobin[1-3]` unit test in `compaction_picker_test.cc` and `RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources/[0-4]`, `RoundRobinSubcompactionsAgainstPressureToken.PressureTokenTest/[0-1]` in `db_compaction_test.cc`
Reviewed By: ajkr, hx235
Differential Revision: D37792644
Pulled By: littlepig2013
fbshipit-source-id: 7fecb7c4ffd97b34bbf6e3b760b2c35a772a0657
2 years ago
|
|
|
TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) {
|
|
|
|
const int kKeysPerBuffer = 100;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.max_bytes_for_level_multiplier = 2;
|
|
|
|
options.level0_file_num_compaction_trigger = 4;
|
|
|
|
options.target_file_size_base = kKeysPerBuffer * 1024;
|
|
|
|
options.compaction_pri = CompactionPri::kRoundRobin;
|
|
|
|
options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
// Setup 7 threads but limited subcompactions so that
|
|
|
|
// RoundRobin requires extra compactions from reserved threads
|
|
|
|
options.max_subcompactions = 1;
|
|
|
|
options.max_background_compactions = 7;
|
|
|
|
options.max_compaction_bytes = 100000000;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
env_->SetBackgroundThreads(7, Env::LOW);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
const std::vector<int> files_per_level = {0, 15, 25};
|
|
|
|
for (int lvl = 2; lvl > 0; lvl--) {
|
|
|
|
for (int i = 0; i < files_per_level[lvl]; i++) {
|
|
|
|
for (int j = 0; j < kKeysPerBuffer; j++) {
|
|
|
|
// Add (lvl-1) to ensure nearly equivallent number of files
|
|
|
|
// in L2 are overlapped with fils selected to compact from
|
|
|
|
// L1
|
|
|
|
ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
|
|
|
|
rnd.RandomString(1010)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(lvl);
|
|
|
|
ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
|
|
|
|
}
|
|
|
|
// 15 files in L1; 25 files in L2
|
|
|
|
|
|
|
|
// This is a variable for making sure the following callback is called
|
|
|
|
// and the assertions in it are indeed excuted.
|
|
|
|
bool num_planned_subcompactions_verified = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
|
|
|
|
uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
|
|
|
|
if (grab_pressure_token_) {
|
|
|
|
// 7 files are selected for round-robin under auto
|
|
|
|
// compaction. The number of planned subcompaction is restricted by
|
|
|
|
// the limited number of max_background_compactions
|
|
|
|
ASSERT_EQ(num_planned_subcompactions, 7);
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(num_planned_subcompactions, 1);
|
|
|
|
}
|
|
|
|
num_planned_subcompactions_verified = true;
|
|
|
|
});
|
|
|
|
|
|
|
|
// The following 3 dependencies have to be added to ensure the auto
|
|
|
|
// compaction and the pressure token is correctly enabled. Same for
|
|
|
|
// RoundRobinSubcompactionsUsingResources and
|
|
|
|
// DBCompactionTest.RoundRobinSubcompactionsShrinkResources
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"RoundRobinSubcompactionsAgainstPressureToken:0",
|
|
|
|
"BackgroundCallCompaction:0"},
|
|
|
|
{"CompactionJob::AcquireSubcompactionResources:0",
|
|
|
|
"RoundRobinSubcompactionsAgainstPressureToken:1"},
|
|
|
|
{"RoundRobinSubcompactionsAgainstPressureToken:2",
|
|
|
|
"CompactionJob::AcquireSubcompactionResources:1"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0");
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1");
|
|
|
|
std::unique_ptr<WriteControllerToken> pressure_token;
|
|
|
|
if (grab_pressure_token_) {
|
|
|
|
pressure_token =
|
|
|
|
dbfull()->TEST_write_controler().GetCompactionPressureToken();
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2");
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->WaitForCompact());
|
|
|
|
ASSERT_TRUE(num_planned_subcompactions_verified);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
|
|
|
|
RoundRobinSubcompactionsAgainstPressureToken,
|
|
|
|
testing::Bool());
|
|
|
|
|
|
|
|
TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
|
|
|
|
const int kKeysPerBuffer = 200;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options.target_file_size_base = kKeysPerBuffer * 1024;
|
|
|
|
options.compaction_pri = CompactionPri::kRoundRobin;
|
|
|
|
options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.max_subcompactions = 1;
|
|
|
|
options.max_background_compactions = max_compaction_limits_;
|
|
|
|
// Set a large number for max_compaction_bytes so that one round-robin
|
|
|
|
// compaction is enough to make post-compaction L1 size less than
|
|
|
|
// the maximum size (this test assumes only one round-robin compaction
|
|
|
|
// is triggered by kLevelMaxLevelSize)
|
|
|
|
options.max_compaction_bytes = 100000000;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
const std::vector<int> files_per_level = {0, 40, 100};
|
|
|
|
for (int lvl = 2; lvl > 0; lvl--) {
|
|
|
|
for (int i = 0; i < files_per_level[lvl]; i++) {
|
|
|
|
for (int j = 0; j < kKeysPerBuffer; j++) {
|
|
|
|
// Add (lvl-1) to ensure nearly equivallent number of files
|
|
|
|
// in L2 are overlapped with fils selected to compact from
|
|
|
|
// L1
|
|
|
|
ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
|
|
|
|
rnd.RandomString(1010)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(lvl);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
// 40 files in L1; 100 files in L2
|
|
|
|
// This is a variable for making sure the following callback is called
|
|
|
|
// and the assertions in it are indeed excuted.
|
|
|
|
bool num_planned_subcompactions_verified = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
|
|
|
|
uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
|
|
|
|
// More than 10 files are selected for round-robin under auto
|
|
|
|
// compaction. The number of planned subcompaction is restricted by
|
|
|
|
// the minimum number between available threads and compaction limits
|
|
|
|
ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
|
|
|
|
std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
|
|
|
|
num_planned_subcompactions_verified = true;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"RoundRobinSubcompactionsAgainstResources:0",
|
|
|
|
"BackgroundCallCompaction:0"},
|
|
|
|
{"CompactionJob::AcquireSubcompactionResources:0",
|
|
|
|
"RoundRobinSubcompactionsAgainstResources:1"},
|
|
|
|
{"RoundRobinSubcompactionsAgainstResources:2",
|
|
|
|
"CompactionJob::AcquireSubcompactionResources:1"},
|
|
|
|
{"CompactionJob::ReleaseSubcompactionResources:0",
|
|
|
|
"RoundRobinSubcompactionsAgainstResources:3"},
|
|
|
|
{"RoundRobinSubcompactionsAgainstResources:4",
|
|
|
|
"CompactionJob::ReleaseSubcompactionResources:1"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->WaitForCompact());
|
|
|
|
ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
|
|
|
|
auto pressure_token =
|
|
|
|
dbfull()->TEST_write_controler().GetCompactionPressureToken();
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
|
|
|
|
// We can reserve more threads now except one is being used
|
|
|
|
ASSERT_EQ(total_low_pri_threads_ - 1,
|
|
|
|
env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
|
|
|
|
ASSERT_EQ(
|
|
|
|
total_low_pri_threads_ - 1,
|
|
|
|
env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
|
|
|
|
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
|
|
|
|
ASSERT_OK(dbfull()->WaitForCompact());
|
|
|
|
ASSERT_TRUE(num_planned_subcompactions_verified);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources,
|
|
|
|
RoundRobinSubcompactionsAgainstResources,
|
|
|
|
::testing::Values(std::make_tuple(1, 5),
|
|
|
|
std::make_tuple(5, 1),
|
|
|
|
std::make_tuple(10, 5),
|
|
|
|
std::make_tuple(5, 10),
|
|
|
|
std::make_tuple(10, 10)));
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithParam, RoundRobinWithoutAdditionalResources) {
|
|
|
|
const int kKeysPerBuffer = 200;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.num_levels = 4;
|
|
|
|
options.level0_file_num_compaction_trigger = 3;
|
|
|
|
options.target_file_size_base = kKeysPerBuffer * 1024;
|
|
|
|
options.compaction_pri = CompactionPri::kRoundRobin;
|
|
|
|
options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options.max_background_compactions = 1;
|
|
|
|
options.max_compaction_bytes = 100000000;
|
|
|
|
// Similar experiment setting as above except the max_subcompactions
|
|
|
|
// is given by max_subcompactions_ (1 or 4), and we fix the
|
|
|
|
// additional resources as (1, 1) and thus no more extra resources
|
|
|
|
// can be used
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
env_->SetBackgroundThreads(1, Env::LOW);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
const std::vector<int> files_per_level = {0, 33, 100};
|
|
|
|
for (int lvl = 2; lvl > 0; lvl--) {
|
|
|
|
for (int i = 0; i < files_per_level[lvl]; i++) {
|
|
|
|
for (int j = 0; j < kKeysPerBuffer; j++) {
|
|
|
|
// Add (lvl-1) to ensure nearly equivallent number of files
|
|
|
|
// in L2 are overlapped with fils selected to compact from
|
|
|
|
// L1
|
|
|
|
ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
|
|
|
|
rnd.RandomString(1010)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(lvl);
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
// 33 files in L1; 100 files in L2
|
|
|
|
// This is a variable for making sure the following callback is called
|
|
|
|
// and the assertions in it are indeed excuted.
|
|
|
|
bool num_planned_subcompactions_verified = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
|
|
|
|
uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
|
|
|
|
// At most 4 files are selected for round-robin under auto
|
|
|
|
// compaction. The number of planned subcompaction is restricted by
|
|
|
|
// the max_subcompactions since no extra resources can be used
|
|
|
|
ASSERT_EQ(num_planned_subcompactions, options.max_subcompactions);
|
|
|
|
num_planned_subcompactions_verified = true;
|
|
|
|
});
|
|
|
|
// No need to setup dependency for pressure token since
|
|
|
|
// AcquireSubcompactionResources may not be called and it anyway cannot
|
|
|
|
// reserve any additional resources
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBCompactionTest::RoundRobinWithoutAdditionalResources:0",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->WaitForCompact());
|
|
|
|
ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
|
|
|
|
TEST_SYNC_POINT("DBCompactionTest::RoundRobinWithoutAdditionalResources:0");
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->WaitForCompact());
|
|
|
|
ASSERT_TRUE(num_planned_subcompactions_verified);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.write_buffer_size = 4 * 1024;
|
|
|
|
options.max_bytes_for_level_base = 64 * 1024;
|
|
|
|
options.max_bytes_for_level_multiplier = 4;
|
|
|
|
options.level0_file_num_compaction_trigger = 4;
|
|
|
|
options.compaction_pri = CompactionPri::kRoundRobin;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
ASSERT_NE(cfd, nullptr);
|
|
|
|
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
ASSERT_NE(current, nullptr);
|
|
|
|
|
|
|
|
VersionStorageInfo* storage_info = current->storage_info();
|
|
|
|
ASSERT_NE(storage_info, nullptr);
|
|
|
|
|
|
|
|
const InternalKey split_cursor = InternalKey(Key(600), 100, kTypeValue);
|
|
|
|
storage_info->AddCursorForOneLevel(2, split_cursor);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
for (int i = 0; i < 50; i++) {
|
|
|
|
for (int j = 0; j < 50; j++) {
|
|
|
|
ASSERT_OK(Put(Key(j * 2 + i * 100), rnd.RandomString(102)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Add more overlapping files (avoid trivial move) to trigger compaction that
|
|
|
|
// output files in L2. Note that trivial move does not trigger compaction and
|
|
|
|
// in that case the cursor is not necessarily the boundary of file.
|
|
|
|
for (int i = 0; i < 50; i++) {
|
|
|
|
for (int j = 0; j < 50; j++) {
|
|
|
|
ASSERT_OK(Put(Key(j * 2 + 1 + i * 100), rnd.RandomString(1014)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
std::vector<std::vector<FileMetaData>> level_to_files;
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
const auto icmp = cfd->current()->storage_info()->InternalComparator();
|
|
|
|
// Files in level 2 should be split by the cursor
|
|
|
|
for (const auto& file : level_to_files[2]) {
|
|
|
|
ASSERT_TRUE(
|
|
|
|
icmp->Compare(file.smallest.Encode(), split_cursor.Encode()) >= 0 ||
|
|
|
|
icmp->Compare(file.largest.Encode(), split_cursor.Encode()) < 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
class NoopMergeOperator : public MergeOperator {
|
|
|
|
public:
|
|
|
|
NoopMergeOperator() {}
|
|
|
|
|
|
|
|
bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
|
|
|
|
MergeOperationOutput* merge_out) const override {
|
|
|
|
std::string val("bar");
|
|
|
|
merge_out->new_value = val;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* Name() const override { return "Noop"; }
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, PartialManualCompaction) {
|
|
|
|
Options opts = CurrentOptions();
|
|
|
|
opts.num_levels = 3;
|
|
|
|
opts.level0_file_num_compaction_trigger = 10;
|
|
|
|
opts.compression = kNoCompression;
|
|
|
|
opts.merge_operator.reset(new NoopMergeOperator());
|
|
|
|
opts.target_file_size_base = 10240;
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (auto i = 0; i < 8; ++i) {
|
|
|
|
for (auto j = 0; j < 10; ++j) {
|
|
|
|
ASSERT_OK(Merge("foo", rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
std::string prop;
|
|
|
|
EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
|
|
|
|
uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
|
|
|
|
ASSERT_OK(dbfull()->SetOptions(
|
|
|
|
{{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
|
|
|
|
// Regression test for bug where manual compaction hangs forever when the DB
|
|
|
|
// is in read-only mode. Verify it now at least returns, despite failing.
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
std::unique_ptr<FaultInjectionTestEnv> mock_env(
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
4 years ago
|
|
|
new FaultInjectionTestEnv(env_));
|
|
|
|
Options opts = CurrentOptions();
|
|
|
|
opts.disable_auto_compactions = true;
|
|
|
|
opts.env = mock_env.get();
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < kNumL0Files; ++i) {
|
|
|
|
// Make sure files are overlapping in key-range to prevent trivial move.
|
|
|
|
ASSERT_OK(Put("key1", rnd.RandomString(1024)));
|
|
|
|
ASSERT_OK(Put("key2", rnd.RandomString(1024)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
// Enter read-only mode by failing a write.
|
|
|
|
mock_env->SetFilesystemActive(false);
|
|
|
|
// Make sure this is outside `CompactRange`'s range so that it doesn't fail
|
|
|
|
// early trying to flush memtable.
|
|
|
|
ASSERT_NOK(Put("key3", rnd.RandomString(1024)));
|
|
|
|
|
|
|
|
// In the bug scenario, the first manual compaction would fail and forget to
|
|
|
|
// unregister itself, causing the second one to hang forever due to conflict
|
|
|
|
// with a non-running compaction.
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = false;
|
|
|
|
Slice begin_key("key1");
|
|
|
|
Slice end_key("key2");
|
|
|
|
ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
|
|
|
|
ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
|
|
|
|
|
|
|
|
// Close before mock_env destruct.
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// ManualCompactionBottomLevelOptimization tests the bottom level manual
|
|
|
|
// compaction optimization to skip recompacting files created by Ln-1 to Ln
|
|
|
|
// compaction
|
|
|
|
TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
|
|
|
|
Options opts = CurrentOptions();
|
|
|
|
opts.num_levels = 3;
|
|
|
|
opts.level0_file_num_compaction_trigger = 5;
|
|
|
|
opts.compression = kNoCompression;
|
|
|
|
opts.merge_operator.reset(new NoopMergeOperator());
|
|
|
|
opts.target_file_size_base = 1024;
|
|
|
|
opts.max_bytes_for_level_multiplier = 2;
|
|
|
|
opts.disable_auto_compactions = true;
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
ColumnFamilyHandleImpl* cfh =
|
|
|
|
static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
|
|
|
|
ColumnFamilyData* cfd = cfh->cfd();
|
|
|
|
InternalStats* internal_stats_ptr = cfd->internal_stats();
|
|
|
|
ASSERT_NE(internal_stats_ptr, nullptr);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (auto i = 0; i < 8; ++i) {
|
|
|
|
for (auto j = 0; j < 10; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (auto i = 0; i < 8; ++i) {
|
|
|
|
for (auto j = 0; j < 10; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
const std::vector<InternalStats::CompactionStats>& comp_stats =
|
|
|
|
internal_stats_ptr->TEST_GetCompactionStats();
|
|
|
|
int num = comp_stats[2].num_input_files_in_output_level;
|
|
|
|
ASSERT_EQ(num, 0);
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
|
|
|
|
const std::vector<InternalStats::CompactionStats>& comp_stats2 =
|
|
|
|
internal_stats_ptr->TEST_GetCompactionStats();
|
|
|
|
num = comp_stats2[2].num_input_files_in_output_level;
|
|
|
|
ASSERT_EQ(num, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ManualCompactionMax) {
|
|
|
|
uint64_t l1_avg_size = 0, l2_avg_size = 0;
|
|
|
|
auto generate_sst_func = [&]() {
|
|
|
|
Random rnd(301);
|
|
|
|
for (auto i = 0; i < 100; i++) {
|
|
|
|
for (auto j = 0; j < 10; j++) {
|
|
|
|
ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (auto i = 0; i < 10; i++) {
|
|
|
|
for (auto j = 0; j < 10; j++) {
|
|
|
|
ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
std::vector<std::vector<FileMetaData>> level_to_files;
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
|
|
|
|
uint64_t total = 0;
|
|
|
|
for (const auto& file : level_to_files[1]) {
|
|
|
|
total += file.compensated_file_size;
|
|
|
|
}
|
|
|
|
l1_avg_size = total / level_to_files[1].size();
|
|
|
|
|
|
|
|
total = 0;
|
|
|
|
for (const auto& file : level_to_files[2]) {
|
|
|
|
total += file.compensated_file_size;
|
|
|
|
}
|
|
|
|
l2_avg_size = total / level_to_files[2].size();
|
|
|
|
};
|
|
|
|
|
|
|
|
std::atomic_int num_compactions(0);
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options opts = CurrentOptions();
|
|
|
|
opts.disable_auto_compactions = true;
|
|
|
|
|
|
|
|
// with default setting (1.6G by default), it should cover all files in 1
|
|
|
|
// compaction
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
generate_sst_func();
|
|
|
|
num_compactions.store(0);
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ASSERT_TRUE(num_compactions.load() == 1);
|
|
|
|
|
|
|
|
// split the compaction to 5
|
|
|
|
int num_split = 5;
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
generate_sst_func();
|
|
|
|
uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
|
|
|
|
opts.max_compaction_bytes = total_size / num_split;
|
Incremental Space Amp Compactions in Universal Style (#8655)
Summary:
This commit introduces incremental compaction in univeral style for space amplification. This follows the first improvement mentioned in https://rocksdb.org/blog/2021/04/12/universal-improvements.html . The implemention simply picks up files about size of max_compaction_bytes to compact and execute if the penalty is not too big. More optimizations can be done in the future, e.g. prioritizing between this compaction and other types. But for now, the feature is supposed to be functional and can often reduce frequency of full compactions, although it can introduce penalty.
In order to add cut files more efficiently so that more files from upper levels can be included, SST file cutting threshold (for current file + overlapping parent level files) is set to 1.5X of target file size. A 2MB target file size will generate files like this: https://gist.github.com/siying/29d2676fba417404f3c95e6c013c7de8 Number of files indeed increases but it is not out of control.
Two set of write benchmarks are run:
1. For ingestion rate limited scenario, we can see full compaction is mostly eliminated: https://gist.github.com/siying/959bc1186066906831cf4c808d6e0a19 . The write amp increased from 7.7 to 9.4, as expected. After applying file cutting, the number is improved to 8.9. In another benchmark, the write amp is even better with the incremental approach: https://gist.github.com/siying/d1c16c286d7c59c4d7bba718ca198163
2. For ingestion rate unlimited scenario, incremental compaction turns out to be too expensive most of the time and is not executed, as expected.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8655
Test Plan: Add unit tests to the functionality.
Reviewed By: ajkr
Differential Revision: D31787034
fbshipit-source-id: ce813e63b15a61d5a56e97bf8902a1b28e011beb
3 years ago
|
|
|
opts.target_file_size_base = total_size / num_split;
|
|
|
|
Reopen(opts);
|
|
|
|
num_compactions.store(0);
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ASSERT_TRUE(num_compactions.load() == num_split);
|
|
|
|
|
|
|
|
// very small max_compaction_bytes, it should still move forward
|
|
|
|
opts.max_compaction_bytes = l1_avg_size / 2;
|
Incremental Space Amp Compactions in Universal Style (#8655)
Summary:
This commit introduces incremental compaction in univeral style for space amplification. This follows the first improvement mentioned in https://rocksdb.org/blog/2021/04/12/universal-improvements.html . The implemention simply picks up files about size of max_compaction_bytes to compact and execute if the penalty is not too big. More optimizations can be done in the future, e.g. prioritizing between this compaction and other types. But for now, the feature is supposed to be functional and can often reduce frequency of full compactions, although it can introduce penalty.
In order to add cut files more efficiently so that more files from upper levels can be included, SST file cutting threshold (for current file + overlapping parent level files) is set to 1.5X of target file size. A 2MB target file size will generate files like this: https://gist.github.com/siying/29d2676fba417404f3c95e6c013c7de8 Number of files indeed increases but it is not out of control.
Two set of write benchmarks are run:
1. For ingestion rate limited scenario, we can see full compaction is mostly eliminated: https://gist.github.com/siying/959bc1186066906831cf4c808d6e0a19 . The write amp increased from 7.7 to 9.4, as expected. After applying file cutting, the number is improved to 8.9. In another benchmark, the write amp is even better with the incremental approach: https://gist.github.com/siying/d1c16c286d7c59c4d7bba718ca198163
2. For ingestion rate unlimited scenario, incremental compaction turns out to be too expensive most of the time and is not executed, as expected.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8655
Test Plan: Add unit tests to the functionality.
Reviewed By: ajkr
Differential Revision: D31787034
fbshipit-source-id: ce813e63b15a61d5a56e97bf8902a1b28e011beb
3 years ago
|
|
|
opts.target_file_size_base = l1_avg_size / 2;
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
generate_sst_func();
|
|
|
|
num_compactions.store(0);
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ASSERT_TRUE(num_compactions.load() > 10);
|
|
|
|
|
|
|
|
// dynamically set the option
|
|
|
|
num_split = 2;
|
|
|
|
opts.max_compaction_bytes = 0;
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
generate_sst_func();
|
|
|
|
total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
|
|
|
|
Status s = db_->SetOptions(
|
Incremental Space Amp Compactions in Universal Style (#8655)
Summary:
This commit introduces incremental compaction in univeral style for space amplification. This follows the first improvement mentioned in https://rocksdb.org/blog/2021/04/12/universal-improvements.html . The implemention simply picks up files about size of max_compaction_bytes to compact and execute if the penalty is not too big. More optimizations can be done in the future, e.g. prioritizing between this compaction and other types. But for now, the feature is supposed to be functional and can often reduce frequency of full compactions, although it can introduce penalty.
In order to add cut files more efficiently so that more files from upper levels can be included, SST file cutting threshold (for current file + overlapping parent level files) is set to 1.5X of target file size. A 2MB target file size will generate files like this: https://gist.github.com/siying/29d2676fba417404f3c95e6c013c7de8 Number of files indeed increases but it is not out of control.
Two set of write benchmarks are run:
1. For ingestion rate limited scenario, we can see full compaction is mostly eliminated: https://gist.github.com/siying/959bc1186066906831cf4c808d6e0a19 . The write amp increased from 7.7 to 9.4, as expected. After applying file cutting, the number is improved to 8.9. In another benchmark, the write amp is even better with the incremental approach: https://gist.github.com/siying/d1c16c286d7c59c4d7bba718ca198163
2. For ingestion rate unlimited scenario, incremental compaction turns out to be too expensive most of the time and is not executed, as expected.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8655
Test Plan: Add unit tests to the functionality.
Reviewed By: ajkr
Differential Revision: D31787034
fbshipit-source-id: ce813e63b15a61d5a56e97bf8902a1b28e011beb
3 years ago
|
|
|
{{"max_compaction_bytes", std::to_string(total_size / num_split)},
|
|
|
|
{"target_file_size_base", std::to_string(total_size / num_split)}});
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
num_compactions.store(0);
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ASSERT_TRUE(num_compactions.load() == num_split);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionDuringShutdown) {
|
|
|
|
Options opts = CurrentOptions();
|
|
|
|
opts.level0_file_num_compaction_trigger = 2;
|
|
|
|
opts.disable_auto_compactions = true;
|
|
|
|
DestroyAndReopen(opts);
|
|
|
|
ColumnFamilyHandleImpl* cfh =
|
|
|
|
static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
|
|
|
|
ColumnFamilyData* cfd = cfh->cfd();
|
|
|
|
InternalStats* internal_stats_ptr = cfd->internal_stats();
|
|
|
|
ASSERT_NE(internal_stats_ptr, nullptr);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (auto i = 0; i < 2; ++i) {
|
|
|
|
for (auto j = 0; j < 10; ++j) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
|
|
|
|
[&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
|
|
|
|
ASSERT_TRUE(s.ok() || s.IsShutdownInProgress());
|
|
|
|
ASSERT_OK(dbfull()->error_handler_.GetBGError());
|
|
|
|
}
|
|
|
|
|
|
|
|
// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
|
|
|
|
// file ingestion do not cause deadlock in the event of write stall triggered
|
|
|
|
// by number of L0 files reaching level0_stop_writes_trigger.
|
|
|
|
TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
|
|
|
|
const int kNumKeysPerFile = 100;
|
|
|
|
// Generate SST files.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
|
|
|
|
// Generate an external SST file containing a single key, i.e. 99
|
|
|
|
std::string sst_files_dir = dbname_ + "/sst_files/";
|
|
|
|
ASSERT_OK(DestroyDir(env_, sst_files_dir));
|
|
|
|
ASSERT_OK(env_->CreateDir(sst_files_dir));
|
|
|
|
SstFileWriter sst_writer(EnvOptions(), options);
|
|
|
|
const std::string sst_file_path = sst_files_dir + "test.sst";
|
|
|
|
ASSERT_OK(sst_writer.Open(sst_file_path));
|
|
|
|
ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
|
|
|
|
ASSERT_OK(sst_writer.Finish());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
|
|
|
|
"BackgroundCallCompaction:0"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
options.write_buffer_size = 110 << 10; // 110KB
|
|
|
|
options.level0_file_num_compaction_trigger =
|
|
|
|
options.level0_stop_writes_trigger;
|
|
|
|
options.max_subcompactions = max_subcompactions_;
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
// Generate level0_stop_writes_trigger L0 files to trigger write stop
|
|
|
|
for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
|
|
|
|
for (int j = 0; j != kNumKeysPerFile; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
|
|
|
|
}
|
|
|
|
if (i > 0) {
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// When we reach this point, there will be level0_stop_writes_trigger L0
|
|
|
|
// files and one extra key (99) in memory, which overlaps with the external
|
|
|
|
// SST file. Write stall triggers, and can be cleared only after compaction
|
|
|
|
// reduces the number of L0 files.
|
|
|
|
|
|
|
|
// Compaction will also be triggered since we have reached the threshold for
|
|
|
|
// auto compaction. Note that compaction may begin after the following file
|
|
|
|
// ingestion thread and waits for ingestion to finish.
|
|
|
|
|
|
|
|
// Thread to ingest file with overlapping key range with the current
|
|
|
|
// memtable. Consequently ingestion will trigger a flush. The flush MUST
|
|
|
|
// proceed without waiting for the write stall condition to clear, otherwise
|
|
|
|
// deadlock can happen.
|
|
|
|
port::Thread ingestion_thr([&]() {
|
|
|
|
IngestExternalFileOptions ifo;
|
|
|
|
Status s = db_->IngestExternalFile({sst_file_path}, ifo);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
});
|
|
|
|
|
|
|
|
// More write to trigger write stop
|
|
|
|
ingestion_thr.join();
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
Close();
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Add missing range conflict check between file ingestion and RefitLevel() (#10988)
Summary:
**Context:**
File ingestion never checks whether the key range it acts on overlaps with an ongoing RefitLevel() (used in `CompactRange()` with `change_level=true`). That's because RefitLevel() doesn't register and make its key range known to file ingestion. Though it checks overlapping with other compactions by https://github.com/facebook/rocksdb/blob/7.8.fb/db/external_sst_file_ingestion_job.cc#L998.
RefitLevel() (used in `CompactRange()` with `change_level=true`) doesn't check whether the key range it acts on overlaps with an ongoing file ingestion. That's because file ingestion does not register and make its key range known to other compactions.
- Note that non-refitlevel-compaction (e.g, manual compaction w/o RefitLevel() or general compaction) also does not check key range overlap with ongoing file ingestion for the same reason.
- But it's fine. Credited to cbi42's discovery, `WaitForIngestFile` was called by background and foreground compactions. They were introduced in https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c.
- Regardless, this PR registers file ingestion like a compaction is a general approach that will also add range conflict check between file ingestion and non-refitlevel-compaction, though it has not been the issue motivated this PR.
Above are bugs resulting in two bad consequences:
- If file ingestion and RefitLevel() creates files in the same level, then range-overlapped files will be created at that level and caught as corruption by `force_consistency_checks=true`
- If file ingestion and RefitLevel() creates file in different levels, then with one further compaction on the ingested file, it can result in two same keys both with seqno 0 in two different levels. Then with iterator's [optimization](https://github.com/facebook/rocksdb/blame/c62f3221698fd273b673d4f7e54eabb8329a4369/db/db_iter.cc#L342-L343) that assumes no two same keys both with seqno 0, it will either break this assertion in debug build or, even worst, return value of this same key for the key after it, which is the wrong value to return, in release build.
Therefore we decide to introduce range conflict check for file ingestion and RefitLevel() inspired from the existing range conflict check among compactions.
**Summary:**
- Treat file ingestion job and RefitLevel() as `Compaction` of new compaction reasons: `CompactionReason::kExternalSstIngestion` and `CompactionReason::kRefitLevel` and register/unregister them. File ingestion is treated as compaction from L0 to different levels and RefitLevel() as compaction from source level to target level.
- Check for `RangeOverlapWithCompaction` with other ongoing compactions, `RegisterCompaction()` on this "compaction" before changing the LSM state in `VersionStorageInfo`, and `UnregisterCompaction()` after changing.
- Replace scattered fixes (https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c.) that prevents overlapping between file ingestion and non-refit-level compaction with this fix cuz those practices are easy to overlook.
- Misc: logic cleanup, see PR comments
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10988
Test Plan:
- New unit test `DBCompactionTestWithOngoingFileIngestionParam*` that failed pre-fix and passed afterwards.
- Made compatible with existing tests, see PR comments
- make check
- [Ongoing] Stress test rehearsal with normal value and aggressive CI value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: cbi42
Differential Revision: D41535685
Pulled By: hx235
fbshipit-source-id: 549833a577ba1496d20a870583d4caa737da1258
2 years ago
|
|
|
class DBCompactionTestWithOngoingFileIngestionParam
|
|
|
|
: public DBCompactionTest,
|
|
|
|
public testing::WithParamInterface<std::string> {
|
|
|
|
public:
|
|
|
|
DBCompactionTestWithOngoingFileIngestionParam() : DBCompactionTest() {
|
|
|
|
compaction_path_to_test_ = GetParam();
|
|
|
|
}
|
|
|
|
void SetupOptions() {
|
|
|
|
options_ = CurrentOptions();
|
|
|
|
options_.create_if_missing = true;
|
|
|
|
|
|
|
|
if (compaction_path_to_test_ == "RefitLevelCompactRange") {
|
|
|
|
options_.num_levels = 7;
|
|
|
|
} else {
|
|
|
|
options_.num_levels = 3;
|
|
|
|
}
|
|
|
|
options_.compaction_style = CompactionStyle::kCompactionStyleLevel;
|
|
|
|
if (compaction_path_to_test_ == "AutoCompaction") {
|
|
|
|
options_.disable_auto_compactions = false;
|
|
|
|
options_.level0_file_num_compaction_trigger = 1;
|
|
|
|
} else {
|
|
|
|
options_.disable_auto_compactions = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void PauseCompactionThread() {
|
|
|
|
sleeping_task_.reset(new test::SleepingBackgroundTask());
|
|
|
|
env_->SetBackgroundThreads(1, Env::LOW);
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
|
|
|
sleeping_task_.get(), Env::Priority::LOW);
|
|
|
|
sleeping_task_->WaitUntilSleeping();
|
|
|
|
}
|
|
|
|
|
|
|
|
void ResumeCompactionThread() {
|
|
|
|
if (sleeping_task_) {
|
|
|
|
sleeping_task_->WakeUp();
|
|
|
|
sleeping_task_->WaitUntilDone();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetupFilesToForceFutureFilesIngestedToCertainLevel() {
|
|
|
|
SstFileWriter sst_file_writer(EnvOptions(), options_);
|
|
|
|
std::string dummy = dbname_ + "/dummy.sst";
|
|
|
|
ASSERT_OK(sst_file_writer.Open(dummy));
|
|
|
|
ASSERT_OK(sst_file_writer.Put("k2", "dummy"));
|
|
|
|
ASSERT_OK(sst_file_writer.Finish());
|
|
|
|
ASSERT_OK(db_->IngestExternalFile({dummy}, IngestExternalFileOptions()));
|
|
|
|
// L2 is made to contain a file overlapped with files to be ingested in
|
|
|
|
// later steps on key "k2". This will force future files ingested to L1 or
|
|
|
|
// above.
|
|
|
|
ASSERT_EQ("0,0,1", FilesPerLevel(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetupSyncPoints() {
|
|
|
|
if (compaction_path_to_test_ == "AutoCompaction") {
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"ExternalSstFileIngestionJob::Run", [&](void*) {
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::BackgroundCompaction():AfterPickCompaction",
|
|
|
|
"VersionSet::LogAndApply:WriteManifest"}});
|
|
|
|
});
|
|
|
|
} else if (compaction_path_to_test_ == "NonRefitLevelCompactRange") {
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"ExternalSstFileIngestionJob::Run", [&](void*) {
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"ColumnFamilyData::CompactRange:Return",
|
|
|
|
"VersionSet::LogAndApply:WriteManifest"}});
|
|
|
|
});
|
|
|
|
} else if (compaction_path_to_test_ == "RefitLevelCompactRange") {
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"ExternalSstFileIngestionJob::Run", [&](void*) {
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::CompactRange:PostRefitLevel",
|
|
|
|
"VersionSet::LogAndApply:WriteManifest"}});
|
|
|
|
});
|
|
|
|
} else if (compaction_path_to_test_ == "CompactFiles") {
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"ExternalSstFileIngestionJob::Run", [&](void*) {
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::CompactFilesImpl::PostSanitizeCompactionInputFiles",
|
|
|
|
"VersionSet::LogAndApply:WriteManifest"}});
|
|
|
|
});
|
|
|
|
} else {
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"ExternalSstFileIngestionJob::Run", "PreCompaction"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
void RunCompactionOverlappedWithFileIngestion() {
|
|
|
|
if (compaction_path_to_test_ == "AutoCompaction") {
|
|
|
|
TEST_SYNC_POINT("PreCompaction");
|
|
|
|
ResumeCompactionThread();
|
|
|
|
// Without proper range conflict check,
|
|
|
|
// this would have been `Status::Corruption` about overlapping ranges
|
|
|
|
Status s = dbfull()->TEST_WaitForCompact();
|
|
|
|
EXPECT_OK(s);
|
|
|
|
} else if (compaction_path_to_test_ == "NonRefitLevelCompactRange") {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = false;
|
|
|
|
std::string start_key = "k1";
|
|
|
|
Slice start(start_key);
|
|
|
|
std::string end_key = "k4";
|
|
|
|
Slice end(end_key);
|
|
|
|
TEST_SYNC_POINT("PreCompaction");
|
|
|
|
// Without proper range conflict check,
|
|
|
|
// this would have been `Status::Corruption` about overlapping ranges
|
|
|
|
Status s = dbfull()->CompactRange(cro, &start, &end);
|
|
|
|
EXPECT_OK(s);
|
|
|
|
} else if (compaction_path_to_test_ == "RefitLevelCompactRange") {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 5;
|
|
|
|
std::string start_key = "k1";
|
|
|
|
Slice start(start_key);
|
|
|
|
std::string end_key = "k4";
|
|
|
|
Slice end(end_key);
|
|
|
|
TEST_SYNC_POINT("PreCompaction");
|
|
|
|
Status s = dbfull()->CompactRange(cro, &start, &end);
|
|
|
|
// Without proper range conflict check,
|
|
|
|
// this would have been `Status::Corruption` about overlapping ranges
|
|
|
|
// To see this, remove the fix AND replace
|
|
|
|
// `DBImpl::CompactRange:PostRefitLevel` in sync point dependency with
|
|
|
|
// `DBImpl::ReFitLevel:PostRegisterCompaction`
|
|
|
|
EXPECT_TRUE(s.IsNotSupported());
|
|
|
|
EXPECT_TRUE(s.ToString().find("some ongoing compaction's output") !=
|
|
|
|
std::string::npos);
|
|
|
|
} else if (compaction_path_to_test_ == "CompactFiles") {
|
|
|
|
ColumnFamilyMetaData cf_meta_data;
|
|
|
|
db_->GetColumnFamilyMetaData(&cf_meta_data);
|
|
|
|
ASSERT_EQ(cf_meta_data.levels[0].files.size(), 1);
|
|
|
|
std::vector<std::string> input_files;
|
|
|
|
for (const auto& file : cf_meta_data.levels[0].files) {
|
|
|
|
input_files.push_back(file.name);
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("PreCompaction");
|
|
|
|
Status s = db_->CompactFiles(CompactionOptions(), input_files, 1);
|
|
|
|
// Without proper range conflict check,
|
|
|
|
// this would have been `Status::Corruption` about overlapping ranges
|
|
|
|
EXPECT_TRUE(s.IsAborted());
|
|
|
|
EXPECT_TRUE(
|
|
|
|
s.ToString().find(
|
|
|
|
"A running compaction is writing to the same output level") !=
|
|
|
|
std::string::npos);
|
|
|
|
} else {
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DisableSyncPoints() {
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
std::string compaction_path_to_test_;
|
|
|
|
Options options_;
|
|
|
|
std::shared_ptr<test::SleepingBackgroundTask> sleeping_task_;
|
|
|
|
};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBCompactionTestWithOngoingFileIngestionParam,
|
|
|
|
DBCompactionTestWithOngoingFileIngestionParam,
|
|
|
|
::testing::Values("AutoCompaction",
|
|
|
|
"NonRefitLevelCompactRange",
|
|
|
|
"RefitLevelCompactRange",
|
|
|
|
"CompactFiles"));
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithOngoingFileIngestionParam, RangeConflictCheck) {
|
|
|
|
SetupOptions();
|
|
|
|
DestroyAndReopen(options_);
|
|
|
|
|
|
|
|
if (compaction_path_to_test_ == "AutoCompaction") {
|
|
|
|
PauseCompactionThread();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (compaction_path_to_test_ != "RefitLevelCompactRange") {
|
|
|
|
SetupFilesToForceFutureFilesIngestedToCertainLevel();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create s1
|
|
|
|
ASSERT_OK(Put("k1", "v"));
|
|
|
|
ASSERT_OK(Put("k4", "v"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
if (compaction_path_to_test_ == "RefitLevelCompactRange") {
|
|
|
|
MoveFilesToLevel(6 /* level */);
|
|
|
|
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel(0));
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ("1,0,1", FilesPerLevel(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
// To coerce following sequence of events
|
|
|
|
// Timeline Thread 1 (Ingest s2) Thread 2 (Compact s1)
|
|
|
|
// t0 | Decide to output to Lk
|
|
|
|
// t1 | Release lock in LogAndApply()
|
|
|
|
// t2 | Acquire lock
|
|
|
|
// t3 | Decides to compact to Lk
|
|
|
|
// | Expected to fail due to range
|
|
|
|
// | conflict check with file
|
|
|
|
// | ingestion
|
|
|
|
// t4 | Release lock in LogAndApply()
|
|
|
|
// t5 | Acquire lock again and finish
|
|
|
|
// t6 | Acquire lock again and finish
|
|
|
|
SetupSyncPoints();
|
|
|
|
|
|
|
|
// Ingest s2
|
|
|
|
port::Thread thread1([&] {
|
|
|
|
SstFileWriter sst_file_writer(EnvOptions(), options_);
|
|
|
|
std::string s2 = dbname_ + "/ingested_s2.sst";
|
|
|
|
ASSERT_OK(sst_file_writer.Open(s2));
|
|
|
|
ASSERT_OK(sst_file_writer.Put("k2", "v2"));
|
|
|
|
ASSERT_OK(sst_file_writer.Put("k3", "v2"));
|
|
|
|
ASSERT_OK(sst_file_writer.Finish());
|
|
|
|
ASSERT_OK(db_->IngestExternalFile({s2}, IngestExternalFileOptions()));
|
|
|
|
});
|
|
|
|
|
|
|
|
// Compact s1. Without proper range conflict check,
|
|
|
|
// this will encounter overlapping file corruption.
|
|
|
|
port::Thread thread2([&] { RunCompactionOverlappedWithFileIngestion(); });
|
|
|
|
|
|
|
|
thread1.join();
|
|
|
|
thread2.join();
|
|
|
|
DisableSyncPoints();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ConsistencyFailTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.force_consistency_checks = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionBuilder::CheckConsistency0", [&](void* arg) {
|
|
|
|
auto p =
|
|
|
|
reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
|
|
|
|
// just swap the two FileMetaData so that we hit error
|
|
|
|
// in CheckConsistency funcion
|
|
|
|
FileMetaData* temp = *(p->first);
|
|
|
|
*(p->first) = *(p->second);
|
|
|
|
*(p->second) = temp;
|
|
|
|
});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
for (int k = 0; k < 2; ++k) {
|
|
|
|
ASSERT_OK(Put("foo", "bar"));
|
|
|
|
Status s = Flush();
|
|
|
|
if (k < 1) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(s.IsCorruption());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_NOK(Put("foo", "bar"));
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ConsistencyFailTest2) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.force_consistency_checks = true;
|
|
|
|
options.target_file_size_base = 1000;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.block_size = 400; // small block size
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
4 years ago
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionBuilder::CheckConsistency1", [&](void* arg) {
|
|
|
|
auto p =
|
|
|
|
reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
|
|
|
|
// just swap the two FileMetaData so that we hit error
|
|
|
|
// in CheckConsistency funcion
|
|
|
|
FileMetaData* temp = *(p->first);
|
|
|
|
*(p->first) = *(p->second);
|
|
|
|
*(p->second) = temp;
|
|
|
|
});
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
std::string value = rnd.RandomString(1000);
|
|
|
|
|
|
|
|
ASSERT_OK(Put("foo1", value));
|
|
|
|
ASSERT_OK(Put("z", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("foo2", value));
|
|
|
|
ASSERT_OK(Put("z", ""));
|
|
|
|
Status s = Flush();
|
|
|
|
ASSERT_TRUE(s.ok() || s.IsCorruption());
|
|
|
|
|
|
|
|
// This probably returns non-OK, but we rely on the next Put()
|
|
|
|
// to determine the DB is frozen.
|
|
|
|
ASSERT_NOK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_NOK(Put("foo", "bar"));
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
|
|
|
void IngestOneKeyValue(DBImpl* db, const std::string& key,
|
|
|
|
const std::string& value, const Options& options) {
|
|
|
|
ExternalSstFileInfo info;
|
|
|
|
std::string f = test::PerThreadDBPath("sst_file" + key);
|
|
|
|
EnvOptions env;
|
|
|
|
ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
auto s = writer.Open(f);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
// ASSERT_OK(writer.Put(Key(), ""));
|
|
|
|
ASSERT_OK(writer.Put(key, value));
|
|
|
|
|
|
|
|
ASSERT_OK(writer.Finish(&info));
|
|
|
|
IngestExternalFileOptions ingest_opt;
|
|
|
|
|
|
|
|
ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
|
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
|
|
|
|
public:
|
|
|
|
DBCompactionTestL0FilesMisorderCorruption() : DBCompactionTest() {}
|
|
|
|
void SetupOptions(const CompactionStyle compaciton_style,
|
|
|
|
const std::string& compaction_path_to_test = "") {
|
|
|
|
options_ = CurrentOptions();
|
|
|
|
options_.create_if_missing = true;
|
|
|
|
options_.compression = kNoCompression;
|
|
|
|
|
|
|
|
options_.force_consistency_checks = true;
|
|
|
|
options_.compaction_style = compaciton_style;
|
|
|
|
|
|
|
|
if (compaciton_style == CompactionStyle::kCompactionStyleLevel) {
|
|
|
|
options_.num_levels = 7;
|
|
|
|
// Level compaction's PickIntraL0Compaction() impl detail requires
|
|
|
|
// `options.level0_file_num_compaction_trigger` to be
|
|
|
|
// at least 2 files less than the actual number of level 0 files
|
|
|
|
// (i.e, 7 by design in this test)
|
|
|
|
options_.level0_file_num_compaction_trigger = 5;
|
|
|
|
options_.max_background_compactions = 2;
|
|
|
|
options_.write_buffer_size = 2 << 20;
|
|
|
|
options_.max_write_buffer_number = 6;
|
|
|
|
} else if (compaciton_style == CompactionStyle::kCompactionStyleUniversal) {
|
|
|
|
// TODO: expand test coverage to num_lvels > 1 for universal compacion,
|
|
|
|
// which requires careful unit test design to compact to level 0 despite
|
|
|
|
// num_levels > 1
|
|
|
|
options_.num_levels = 1;
|
|
|
|
options_.level0_file_num_compaction_trigger = 5;
|
|
|
|
|
|
|
|
CompactionOptionsUniversal universal_options;
|
|
|
|
if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") {
|
|
|
|
universal_options.max_size_amplification_percent = 50;
|
|
|
|
} else if (compaction_path_to_test ==
|
|
|
|
"PickCompactionToReduceSortedRuns") {
|
|
|
|
universal_options.max_size_amplification_percent = 400;
|
|
|
|
} else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") {
|
|
|
|
universal_options.max_size_amplification_percent = 400;
|
|
|
|
universal_options.min_merge_width = 6;
|
|
|
|
}
|
|
|
|
options_.compaction_options_universal = universal_options;
|
|
|
|
} else if (compaciton_style == CompactionStyle::kCompactionStyleFIFO) {
|
|
|
|
options_.max_open_files = -1;
|
|
|
|
options_.num_levels = 1;
|
|
|
|
options_.level0_file_num_compaction_trigger = 3;
|
|
|
|
|
|
|
|
CompactionOptionsFIFO fifo_options;
|
|
|
|
if (compaction_path_to_test == "FindIntraL0Compaction" ||
|
|
|
|
compaction_path_to_test == "CompactRange") {
|
|
|
|
fifo_options.allow_compaction = true;
|
|
|
|
fifo_options.age_for_warm = 0;
|
|
|
|
} else if (compaction_path_to_test == "CompactFile") {
|
|
|
|
fifo_options.allow_compaction = false;
|
|
|
|
fifo_options.age_for_warm = 0;
|
|
|
|
}
|
|
|
|
options_.compaction_options_fifo = fifo_options;
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
if (compaction_path_to_test == "CompactFile" ||
|
|
|
|
compaction_path_to_test == "CompactRange") {
|
|
|
|
options_.disable_auto_compactions = true;
|
|
|
|
} else {
|
|
|
|
options_.disable_auto_compactions = false;
|
|
|
|
}
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
void Destroy(const Options& options) {
|
|
|
|
if (snapshot_) {
|
|
|
|
assert(db_);
|
|
|
|
db_->ReleaseSnapshot(snapshot_);
|
|
|
|
snapshot_ = nullptr;
|
|
|
|
}
|
|
|
|
DBTestBase::Destroy(options);
|
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
void Reopen(const Options& options) {
|
|
|
|
DBTestBase::Reopen(options);
|
|
|
|
if (options.compaction_style != CompactionStyle::kCompactionStyleLevel) {
|
|
|
|
// To force assigning the global seqno to ingested file
|
|
|
|
// for our test purpose.
|
|
|
|
assert(snapshot_ == nullptr);
|
|
|
|
snapshot_ = db_->GetSnapshot();
|
|
|
|
}
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
void DestroyAndReopen(Options& options) {
|
|
|
|
Destroy(options);
|
|
|
|
Reopen(options);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
void PauseCompactionThread() {
|
|
|
|
sleeping_task_.reset(new test::SleepingBackgroundTask());
|
|
|
|
env_->SetBackgroundThreads(1, Env::LOW);
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
|
|
|
sleeping_task_.get(), Env::Priority::LOW);
|
|
|
|
sleeping_task_->WaitUntilSleeping();
|
|
|
|
}
|
|
|
|
|
|
|
|
void ResumeCompactionThread() {
|
|
|
|
if (sleeping_task_) {
|
|
|
|
sleeping_task_->WakeUp();
|
|
|
|
sleeping_task_->WaitUntilDone();
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
void AddFilesMarkedForPeriodicCompaction(const size_t num_files) {
|
|
|
|
assert(options_.compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleUniversal);
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
assert(current);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
VersionStorageInfo* const storage_info = current->storage_info();
|
|
|
|
assert(storage_info);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0);
|
|
|
|
assert(level0_files.size() == num_files);
|
|
|
|
|
|
|
|
for (FileMetaData* f : level0_files) {
|
|
|
|
storage_info->TEST_AddFileMarkedForPeriodicCompaction(0, f);
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
void AddFilesMarkedForCompaction(const size_t num_files) {
|
|
|
|
assert(options_.compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleUniversal);
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
assert(current);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
VersionStorageInfo* const storage_info = current->storage_info();
|
|
|
|
assert(storage_info);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0);
|
|
|
|
assert(level0_files.size() == num_files);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
for (FileMetaData* f : level0_files) {
|
|
|
|
storage_info->TEST_AddFileMarkedForCompaction(0, f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetupSyncPoints(const std::string& compaction_path_to_test) {
|
|
|
|
compaction_path_sync_point_called_.store(false);
|
|
|
|
if (compaction_path_to_test == "FindIntraL0Compaction" &&
|
|
|
|
options_.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PostPickFileToCompact", [&](void* arg) {
|
|
|
|
bool* picked_file_to_compact = (bool*)arg;
|
|
|
|
// To trigger intra-L0 compaction specifically,
|
|
|
|
// we mock PickFileToCompact()'s result to be false
|
|
|
|
*picked_file_to_compact = false;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FindIntraL0Compaction", [&](void* /*arg*/) {
|
|
|
|
compaction_path_sync_point_called_.store(true);
|
|
|
|
});
|
|
|
|
|
|
|
|
} else if (compaction_path_to_test == "PickPeriodicCompaction") {
|
|
|
|
assert(options_.compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleUniversal);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PostPickPeriodicCompaction", [&](void* compaction_arg) {
|
|
|
|
Compaction* compaction = (Compaction*)compaction_arg;
|
|
|
|
if (compaction != nullptr) {
|
|
|
|
compaction_path_sync_point_called_.store(true);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
} else if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") {
|
|
|
|
assert(options_.compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleUniversal);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PickCompactionToReduceSizeAmpReturnNonnullptr", [&](void* /*arg*/) {
|
|
|
|
compaction_path_sync_point_called_.store(true);
|
|
|
|
});
|
|
|
|
} else if (compaction_path_to_test == "PickCompactionToReduceSortedRuns") {
|
|
|
|
assert(options_.compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleUniversal);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PickCompactionToReduceSortedRunsReturnNonnullptr",
|
|
|
|
[&](void* /*arg*/) {
|
|
|
|
compaction_path_sync_point_called_.store(true);
|
|
|
|
});
|
|
|
|
} else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") {
|
|
|
|
assert(options_.compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleUniversal);
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"PickDeleteTriggeredCompactionReturnNonnullptr", [&](void* /*arg*/) {
|
|
|
|
compaction_path_sync_point_called_.store(true);
|
|
|
|
});
|
|
|
|
} else if ((compaction_path_to_test == "FindIntraL0Compaction" ||
|
|
|
|
compaction_path_to_test == "CompactRange") &&
|
|
|
|
options_.compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleFIFO) {
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FindIntraL0Compaction", [&](void* /*arg*/) {
|
|
|
|
compaction_path_sync_point_called_.store(true);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SyncPointsCalled() { return compaction_path_sync_point_called_.load(); }
|
|
|
|
|
|
|
|
void DisableSyncPoints() {
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the largest seqno of the latest L0 file based on file number
|
|
|
|
SequenceNumber GetLatestL0FileLargestSeqnoHelper() {
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
assert(current);
|
|
|
|
VersionStorageInfo* const storage_info = current->storage_info();
|
|
|
|
assert(storage_info);
|
|
|
|
const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0);
|
|
|
|
assert(level0_files.size() >= 1);
|
|
|
|
|
|
|
|
uint64_t latest_file_num = 0;
|
|
|
|
uint64_t latest_file_largest_seqno = 0;
|
|
|
|
for (FileMetaData* f : level0_files) {
|
|
|
|
if (f->fd.GetNumber() > latest_file_num) {
|
|
|
|
latest_file_num = f->fd.GetNumber();
|
|
|
|
latest_file_largest_seqno = f->fd.largest_seqno;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return latest_file_largest_seqno;
|
|
|
|
}
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
protected:
|
|
|
|
Options options_;
|
|
|
|
|
|
|
|
private:
|
|
|
|
const Snapshot* snapshot_ = nullptr;
|
|
|
|
std::atomic<bool> compaction_path_sync_point_called_;
|
|
|
|
std::shared_ptr<test::SleepingBackgroundTask> sleeping_task_;
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTestL0FilesMisorderCorruption,
|
|
|
|
FlushAfterIntraL0LevelCompactionWithIngestedFile) {
|
|
|
|
SetupOptions(CompactionStyle::kCompactionStyleLevel, "");
|
|
|
|
DestroyAndReopen(options_);
|
|
|
|
// Prevents trivial move
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
for (int i = 0; i < 10; ++i) {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
ASSERT_OK(Put(Key(i), "")); // Prevents trivial move
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
Compact("", Key(99));
|
|
|
|
ASSERT_EQ(0, NumTableFilesAtLevel(0));
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
// To get accurate NumTableFilesAtLevel(0) when the number reaches
|
|
|
|
// options_.level0_file_num_compaction_trigger
|
|
|
|
PauseCompactionThread();
|
|
|
|
|
|
|
|
// To create below LSM tree
|
|
|
|
// (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
|
|
|
|
//
|
|
|
|
// memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7]
|
|
|
|
// L0: s6[6:new@13], s5[5:old@6] ... s1[1:old@2],s0[0:old@1]
|
|
|
|
//
|
|
|
|
// (1) Make 6 L0 sst (i.e, s0 - s5)
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
for (int i = 0; i < 6; ++i) {
|
|
|
|
if (i % 2 == 0) {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
IngestOneKeyValue(dbfull(), Key(i), "old", options_);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
} else {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
ASSERT_OK(Put(Key(i), "old"));
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(6, NumTableFilesAtLevel(0));
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
// (2) Create m1
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
for (int i = 0; i < 6; ++i) {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
ASSERT_OK(Put(Key(i), "new"));
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
|
|
|
ASSERT_EQ(6, NumTableFilesAtLevel(0));
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
|
|
|
|
// (3) Ingest file (i.e, s6) to trigger IntraL0Compaction()
|
|
|
|
for (int i = 6; i < 7; ++i) {
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
ASSERT_EQ(i, NumTableFilesAtLevel(0));
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
IngestOneKeyValue(dbfull(), Key(i), "new", options_);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
SetupSyncPoints("FindIntraL0Compaction");
|
|
|
|
ResumeCompactionThread();
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
ASSERT_TRUE(SyncPointsCalled());
|
|
|
|
DisableSyncPoints();
|
|
|
|
|
|
|
|
// After compaction, we have LSM tree:
|
|
|
|
//
|
|
|
|
// memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7]
|
|
|
|
// L0: s7[6:new@13, 5:old@6 .. 0:old@1]
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0));
|
|
|
|
SequenceNumber compact_output_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
// After flush, we have LSM tree:
|
|
|
|
//
|
|
|
|
// L0: s8[5:new@12 .. 0:new@7],s7[6:new@13, 5:old@5 .. 0:old@1]
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(0));
|
|
|
|
SequenceNumber flushed_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
// To verify there isn't any file misorder leading to returning a old value
|
|
|
|
// of Key(0) - Key(5) , which is caused by flushed table s8 has a
|
|
|
|
// smaller largest seqno than the compaction output file s7's largest seqno
|
|
|
|
// while the flushed table has the newer version of the values than the
|
|
|
|
// compaction output file's.
|
|
|
|
ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
for (int i = 0; i < 6; ++i) {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
ASSERT_EQ("new", Get(Key(i)));
|
|
|
|
}
|
|
|
|
for (int i = 6; i < 7; ++i) {
|
|
|
|
ASSERT_EQ("new", Get(Key(i)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTestL0FilesMisorderCorruption,
|
|
|
|
FlushAfterIntraL0UniversalCompactionWithIngestedFile) {
|
|
|
|
for (const std::string compaction_path_to_test :
|
|
|
|
{"PickPeriodicCompaction", "PickCompactionToReduceSizeAmp",
|
|
|
|
"PickCompactionToReduceSortedRuns", "PickDeleteTriggeredCompaction"}) {
|
|
|
|
SetupOptions(CompactionStyle::kCompactionStyleUniversal,
|
|
|
|
compaction_path_to_test);
|
|
|
|
DestroyAndReopen(options_);
|
|
|
|
|
|
|
|
// To get accurate NumTableFilesAtLevel(0) when the number reaches
|
|
|
|
// options_.level0_file_num_compaction_trigger
|
|
|
|
PauseCompactionThread();
|
|
|
|
|
|
|
|
// To create below LSM tree
|
|
|
|
// (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
|
|
|
|
//
|
|
|
|
// memtable: m1 [ k2:new@8, k1:new@7]
|
|
|
|
// L0: s4[k9:dummy@10], s3[k8:dummy@9],
|
|
|
|
// s2[k7:old@6, k6:old@5].. s0[k3:old@2, k1:old@1]
|
|
|
|
//
|
|
|
|
// (1) Create 3 existing SST file (i.e, s0 - s2)
|
|
|
|
ASSERT_OK(Put("k1", "old"));
|
|
|
|
ASSERT_OK(Put("k3", "old"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0));
|
|
|
|
ASSERT_OK(Put("k4", "old"));
|
|
|
|
ASSERT_OK(Put("k5", "old"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(0));
|
|
|
|
ASSERT_OK(Put("k6", "old"));
|
|
|
|
ASSERT_OK(Put("k7", "old"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_EQ(3, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
// (2) Create m1. Noted that it contains a overlaped key with s0
|
|
|
|
ASSERT_OK(Put("k1", "new")); // overlapped key
|
|
|
|
ASSERT_OK(Put("k2", "new"));
|
|
|
|
|
|
|
|
// (3) Ingest two SST files s3, s4
|
|
|
|
IngestOneKeyValue(dbfull(), "k8", "dummy", options_);
|
|
|
|
IngestOneKeyValue(dbfull(), "k9", "dummy", options_);
|
|
|
|
// Up to now, L0 contains s0 - s4
|
|
|
|
ASSERT_EQ(5, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
if (compaction_path_to_test == "PickPeriodicCompaction") {
|
|
|
|
AddFilesMarkedForPeriodicCompaction(5);
|
|
|
|
} else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") {
|
|
|
|
AddFilesMarkedForCompaction(5);
|
|
|
|
}
|
|
|
|
|
|
|
|
SetupSyncPoints(compaction_path_to_test);
|
|
|
|
ResumeCompactionThread();
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_TRUE(SyncPointsCalled())
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
DisableSyncPoints();
|
|
|
|
|
|
|
|
// After compaction, we have LSM tree:
|
|
|
|
//
|
|
|
|
// memtable: m1[ k2:new@8, k1:new@7]
|
|
|
|
// L0: s5[k9:dummy@10, k8@dummy@9, k7:old@6 .. k3:old@2, k1:old@1]
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0))
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
SequenceNumber compact_output_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
ASSERT_OK(Flush()) << "failed for compaction path to test: "
|
|
|
|
<< compaction_path_to_test;
|
|
|
|
// After flush, we have LSM tree:
|
|
|
|
//
|
|
|
|
// L0: s6[k2:new@8, k1:new@7],
|
|
|
|
// s5[k9:dummy@10, k8@dummy@9, k7:old@6 .. k3:old@2, k1:old@1]
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(0))
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
SequenceNumber flushed_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
// To verify there isn't any file misorder leading to returning a old
|
|
|
|
// value of "k1" , which is caused by flushed table s6 has a
|
|
|
|
// smaller largest seqno than the compaction output file s5's largest seqno
|
|
|
|
// while the flushed table has the newer version of the value
|
|
|
|
// than the compaction output file's.
|
|
|
|
ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno)
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
EXPECT_EQ(Get("k1"), "new")
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
}
|
|
|
|
|
|
|
|
Destroy(options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTestL0FilesMisorderCorruption,
|
|
|
|
FlushAfterIntraL0FIFOCompactionWithIngestedFile) {
|
|
|
|
for (const std::string compaction_path_to_test : {"FindIntraL0Compaction"}) {
|
|
|
|
SetupOptions(CompactionStyle::kCompactionStyleFIFO,
|
|
|
|
compaction_path_to_test);
|
|
|
|
DestroyAndReopen(options_);
|
|
|
|
|
|
|
|
// To create below LSM tree
|
|
|
|
// (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
|
|
|
|
//
|
|
|
|
// memtable: m1 [ k2:new@4, k1:new@3]
|
|
|
|
// L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
|
|
|
|
//
|
|
|
|
// (1) Create an existing SST file s0
|
|
|
|
ASSERT_OK(Put("k1", "old"));
|
|
|
|
ASSERT_OK(Put("k3", "old"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
// (2) Create memtable m1. Noted that it contains a overlaped key with s0
|
|
|
|
ASSERT_OK(Put("k1", "new")); // overlapped key
|
|
|
|
ASSERT_OK(Put("k2", "new"));
|
|
|
|
|
|
|
|
// To get accurate NumTableFilesAtLevel(0) when the number reaches
|
|
|
|
// options_.level0_file_num_compaction_trigger
|
|
|
|
PauseCompactionThread();
|
|
|
|
|
|
|
|
// (3) Ingest two SST files s1, s2
|
|
|
|
IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
|
|
|
|
IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
|
|
|
|
// Up to now, L0 contains s0, s1, s2
|
|
|
|
ASSERT_EQ(3, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
SetupSyncPoints(compaction_path_to_test);
|
|
|
|
ResumeCompactionThread();
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_TRUE(SyncPointsCalled())
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
DisableSyncPoints();
|
|
|
|
// After compaction, we have LSM tree:
|
|
|
|
//
|
|
|
|
// memtable: m1 [ k2:new@4, k1:new@3]
|
|
|
|
// L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0))
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
SequenceNumber compact_output_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
ASSERT_OK(Flush()) << "failed for compaction path to test: "
|
|
|
|
<< compaction_path_to_test;
|
|
|
|
// After flush, we have LSM tree:
|
|
|
|
//
|
|
|
|
// L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
|
|
|
|
// k1:old@1]
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(0))
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
SequenceNumber flushed_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
// To verify there isn't any file misorder leading to returning a old
|
|
|
|
// value of "k1" , which is caused by flushed table s4 has a
|
|
|
|
// smaller largest seqno than the compaction output file s3's largest seqno
|
|
|
|
// while the flushed table has the newer version of the value
|
|
|
|
// than the compaction output file's.
|
|
|
|
ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno)
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
EXPECT_EQ(Get("k1"), "new")
|
|
|
|
<< "failed for compaction path to test: " << compaction_path_to_test;
|
|
|
|
}
|
|
|
|
|
|
|
|
Destroy(options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
class DBCompactionTestL0FilesMisorderCorruptionWithParam
|
|
|
|
: public DBCompactionTestL0FilesMisorderCorruption,
|
|
|
|
public testing::WithParamInterface<CompactionStyle> {
|
|
|
|
public:
|
|
|
|
DBCompactionTestL0FilesMisorderCorruptionWithParam()
|
|
|
|
: DBCompactionTestL0FilesMisorderCorruption() {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// TODO: add `CompactionStyle::kCompactionStyleLevel` to testing parameter,
|
|
|
|
// which requires careful unit test
|
|
|
|
// design for ingesting file to L0 and CompactRange()/CompactFile() to L0
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
DBCompactionTestL0FilesMisorderCorruptionWithParam,
|
|
|
|
DBCompactionTestL0FilesMisorderCorruptionWithParam,
|
|
|
|
::testing::Values(CompactionStyle::kCompactionStyleUniversal,
|
|
|
|
CompactionStyle::kCompactionStyleFIFO));
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam,
|
|
|
|
FlushAfterIntraL0CompactFileWithIngestedFile) {
|
|
|
|
SetupOptions(GetParam(), "CompactFile");
|
|
|
|
DestroyAndReopen(options_);
|
|
|
|
|
|
|
|
// To create below LSM tree
|
|
|
|
// (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
|
|
|
|
//
|
|
|
|
// memtable: m1 [ k2:new@4, k1:new@3]
|
|
|
|
// L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
|
|
|
|
//
|
|
|
|
// (1) Create an existing SST file s0
|
|
|
|
ASSERT_OK(Put("k1", "old"));
|
|
|
|
ASSERT_OK(Put("k3", "old"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
// (2) Create memtable m1. Noted that it contains a overlaped key with s0
|
|
|
|
ASSERT_OK(Put("k1", "new")); // overlapped key
|
|
|
|
ASSERT_OK(Put("k2", "new"));
|
|
|
|
|
|
|
|
// (3) Ingest two SST files s1, s2
|
|
|
|
IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
|
|
|
|
IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
|
|
|
|
// Up to now, L0 contains s0, s1, s2
|
|
|
|
ASSERT_EQ(3, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
ColumnFamilyMetaData cf_meta_data;
|
|
|
|
db_->GetColumnFamilyMetaData(&cf_meta_data);
|
|
|
|
ASSERT_EQ(cf_meta_data.levels[0].files.size(), 3);
|
|
|
|
std::vector<std::string> input_files;
|
|
|
|
for (const auto& file : cf_meta_data.levels[0].files) {
|
|
|
|
input_files.push_back(file.name);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
ASSERT_EQ(input_files.size(), 3);
|
|
|
|
|
|
|
|
Status s = db_->CompactFiles(CompactionOptions(), input_files, 0);
|
|
|
|
// After compaction, we have LSM tree:
|
|
|
|
//
|
|
|
|
// memtable: m1 [ k2:new@4, k1:new@3]
|
|
|
|
// L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0));
|
|
|
|
SequenceNumber compact_output_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
// After flush, we have LSM tree:
|
|
|
|
//
|
|
|
|
// L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
|
|
|
|
// k1:old@1]
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(0));
|
|
|
|
SequenceNumber flushed_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
// To verify there isn't any file misorder leading to returning a old value
|
|
|
|
// of "1" , which is caused by flushed table s4 has a smaller
|
|
|
|
// largest seqno than the compaction output file s3's largest seqno while the
|
|
|
|
// flushed table has the newer version of the value than the
|
|
|
|
// compaction output file's.
|
|
|
|
ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
|
|
|
|
EXPECT_EQ(Get("k1"), "new");
|
|
|
|
|
|
|
|
Destroy(options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam,
|
|
|
|
FlushAfterIntraL0CompactRangeWithIngestedFile) {
|
|
|
|
SetupOptions(GetParam(), "CompactRange");
|
|
|
|
DestroyAndReopen(options_);
|
|
|
|
|
|
|
|
// To create below LSM tree
|
|
|
|
// (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
|
|
|
|
//
|
|
|
|
// memtable: m1 [ k2:new@4, k1:new@3]
|
|
|
|
// L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
|
|
|
|
//
|
|
|
|
// (1) Create an existing SST file s0
|
|
|
|
ASSERT_OK(Put("k1", "old"));
|
|
|
|
ASSERT_OK(Put("k3", "old"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
// (2) Create memtable m1. Noted that it contains a overlaped key with s0
|
|
|
|
ASSERT_OK(Put("k1", "new")); // overlapped key
|
|
|
|
ASSERT_OK(Put("k2", "new"));
|
|
|
|
|
|
|
|
// (3) Ingest two SST files s1, s2
|
|
|
|
IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
|
|
|
|
IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
|
|
|
|
// Up to now, L0 contains s0, s1, s2
|
|
|
|
ASSERT_EQ(3, NumTableFilesAtLevel(0));
|
|
|
|
|
|
|
|
if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
|
|
|
|
SetupSyncPoints("CompactRange");
|
|
|
|
}
|
|
|
|
// `start` and `end` is carefully chosen so that compact range:
|
|
|
|
// (1) doesn't overlap with memtable therefore the memtable won't be flushed
|
|
|
|
// (2) should target at compacting s0 with s1 and s2
|
|
|
|
Slice start("k3"), end("k5");
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
|
|
|
|
// After compaction, we have LSM tree:
|
|
|
|
//
|
|
|
|
// memtable: m1 [ k2:new@4, k1:new@3]
|
|
|
|
// L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
|
|
|
|
if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
|
|
|
|
ASSERT_TRUE(SyncPointsCalled());
|
|
|
|
DisableSyncPoints();
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
ASSERT_EQ(1, NumTableFilesAtLevel(0));
|
|
|
|
SequenceNumber compact_output_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
// After flush, we have LSM tree:
|
|
|
|
//
|
|
|
|
// L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
|
|
|
|
// k1:old@1]
|
|
|
|
ASSERT_EQ(2, NumTableFilesAtLevel(0));
|
|
|
|
SequenceNumber flushed_file_largest_seqno =
|
|
|
|
GetLatestL0FileLargestSeqnoHelper();
|
|
|
|
|
|
|
|
// To verify there isn't any file misorder leading to returning a old value
|
|
|
|
// of "k1" , which is caused by flushed table s4 has a smaller
|
|
|
|
// largest seqno than the compaction output file s3's largest seqno while the
|
|
|
|
// flushed table has the newer version of the value than the
|
|
|
|
// compaction output file's.
|
|
|
|
ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
|
|
|
|
EXPECT_EQ(Get("k1"), "new");
|
|
|
|
|
|
|
|
Destroy(options_);
|
Fix corruption with intra-L0 on ingested files (#5958)
Summary:
## Problem Description
Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated.
* RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested.
* When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number.
* `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal.
* If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`.
* If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`.
* If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files:
> s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno
So I skip pick sst file which was ingested in function `FindIntraL0Compaction `
## Reason
Here is my bug report: https://github.com/facebook/rocksdb/issues/5913
There are two situations that can cause the check to fail.
### First situation:
- First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest.
- If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst.
- Then some data was put into memtable, and memtable was flushed to L0. We called this sst B.
- RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested.
### Secondary situaion
- First we have flushed many sst in L0, we call them [s1, s2, s3].
- There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1.
- m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5.
- [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6.
- compacted 4@0 files to L0
- When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5.
- s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958
Differential Revision: D18601316
fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
|
|
|
|
constexpr int kSstNum = 10;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Generate some sst files on level 0 with sequence keys (no overlap)
|
|
|
|
for (int i = 0; i < kSstNum; i++) {
|
|
|
|
for (int j = 1; j < UCHAR_MAX; j++) {
|
|
|
|
auto key = std::string(kSstNum, '\0');
|
|
|
|
key[kSstNum - i] += static_cast<char>(j);
|
|
|
|
ASSERT_OK(Put(key, std::string(i % 1000, 'A')));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
ASSERT_EQ(std::to_string(kSstNum), FilesPerLevel(0));
|
|
|
|
|
|
|
|
auto cro = CompactRangeOptions();
|
|
|
|
cro.bottommost_level_compaction = bottommost_level_compaction_;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce ||
|
|
|
|
bottommost_level_compaction_ ==
|
|
|
|
BottommostLevelCompaction::kForceOptimized) {
|
|
|
|
// Real compaction to compact all sst files from level 0 to 1 file on level
|
|
|
|
// 1
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
} else {
|
|
|
|
// Just trivial move from level 0 -> 1
|
|
|
|
ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam,
|
|
|
|
::testing::Values(BottommostLevelCompaction::kSkip,
|
|
|
|
BottommostLevelCompaction::kIfHaveCompactionFilter,
|
|
|
|
BottommostLevelCompaction::kForce,
|
|
|
|
BottommostLevelCompaction::kForceOptimized));
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.max_subcompactions = 10;
|
|
|
|
options.target_file_size_base = 1 << 10; // 1KB
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
bool has_compaction = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
ASSERT_TRUE(compaction->max_subcompactions() == 10);
|
|
|
|
has_compaction = true;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10);
|
|
|
|
// Trigger compaction
|
|
|
|
for (int i = 0; i < 32; i++) {
|
|
|
|
for (int j = 0; j < 5000; j++) {
|
|
|
|
ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_TRUE(has_compaction);
|
|
|
|
|
|
|
|
has_compaction = false;
|
|
|
|
ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
|
|
|
|
ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
ASSERT_TRUE(compaction->max_subcompactions() == 2);
|
|
|
|
has_compaction = true;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Trigger compaction
|
|
|
|
for (int i = 0; i < 32; i++) {
|
|
|
|
for (int j = 0; j < 5000; j++) {
|
|
|
|
ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_TRUE(has_compaction);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.max_subcompactions = 10;
|
|
|
|
options.compaction_style = kCompactionStyleUniversal;
|
|
|
|
options.target_file_size_base = 1 << 10; // 1KB
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
bool has_compaction = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
ASSERT_TRUE(compaction->max_subcompactions() == 10);
|
|
|
|
has_compaction = true;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Trigger compaction
|
|
|
|
for (int i = 0; i < 32; i++) {
|
|
|
|
for (int j = 0; j < 5000; j++) {
|
|
|
|
ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_TRUE(has_compaction);
|
|
|
|
has_compaction = false;
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
|
|
|
|
ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
|
|
|
|
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
|
|
|
|
ASSERT_TRUE(compaction->max_subcompactions() == 2);
|
|
|
|
has_compaction = true;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Trigger compaction
|
|
|
|
for (int i = 0; i < 32; i++) {
|
|
|
|
for (int j = 0; j < 5000; j++) {
|
|
|
|
ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_TRUE(has_compaction);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(ChangeLevelConflictsWithAuto, TestConflict) {
|
|
|
|
// A `CompactRange()` may race with an automatic compaction, we'll need
|
|
|
|
// to make sure it doesn't corrupte the data.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put("foo", "v1"));
|
|
|
|
ASSERT_OK(Put("bar", "v1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
|
|
|
|
{
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 2;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,0,1", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Run a qury to refitting to level 1 while another thread writing to
|
|
|
|
// the same level.
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
// The first two dependencies ensure the foreground creates an L0 file
|
|
|
|
// between the background compaction's L0->L1 and its L1->L2.
|
|
|
|
{
|
|
|
|
"DBImpl::CompactRange:BeforeRefit:1",
|
|
|
|
"AutoCompactionFinished1",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"AutoCompactionFinished2",
|
|
|
|
"DBImpl::CompactRange:BeforeRefit:2",
|
|
|
|
},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
std::thread auto_comp([&] {
|
|
|
|
TEST_SYNC_POINT("AutoCompactionFinished1");
|
|
|
|
ASSERT_OK(Put("bar", "v2"));
|
|
|
|
ASSERT_OK(Put("foo", "v2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Put("bar", "v3"));
|
|
|
|
ASSERT_OK(Put("foo", "v3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
TEST_SYNC_POINT("AutoCompactionFinished2");
|
|
|
|
});
|
|
|
|
|
|
|
|
{
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = GetParam() ? 1 : 0;
|
|
|
|
// This should return non-OK, but it's more important for the test to
|
|
|
|
// make sure that the DB is not corrupted.
|
|
|
|
ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
auto_comp.join();
|
|
|
|
// Refitting didn't happen.
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
// Write something to DB just make sure that consistency check didn't
|
|
|
|
// fail and make the DB readable.
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto,
|
|
|
|
ChangeLevelConflictsWithAuto, testing::Bool());
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) {
|
|
|
|
// A `CompactRange()` with `change_level == true` needs to execute its final
|
|
|
|
// step, `ReFitLevel()`, in isolation. Previously there was a bug where
|
|
|
|
// refitting could target the same level as an ongoing manual compaction,
|
|
|
|
// leading to overlapping files in that level.
|
|
|
|
//
|
|
|
|
// This test ensures that case is not possible by verifying any manual
|
|
|
|
// compaction issued during the `ReFitLevel()` phase fails with
|
|
|
|
// `Status::Incomplete`.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 3;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Setup an LSM with three levels populated.
|
|
|
|
Random rnd(301);
|
|
|
|
int key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
{
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 2;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1,1,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// The background thread will refit L2->L1 while the
|
|
|
|
// foreground thread will try to simultaneously compact L0->L1.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
// The first two dependencies ensure the foreground creates an L0 file
|
|
|
|
// between the background compaction's L0->L1 and its L1->L2.
|
|
|
|
{
|
|
|
|
"DBImpl::RunManualCompaction()::1",
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
|
|
|
|
"PutFG",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
|
|
|
|
"FlushedFG",
|
|
|
|
"DBImpl::RunManualCompaction()::2",
|
|
|
|
},
|
|
|
|
// The next two dependencies ensure the foreground invokes
|
|
|
|
// `CompactRange()` while the background is refitting. The
|
|
|
|
// foreground's `CompactRange()` is guaranteed to attempt an L0->L1
|
|
|
|
// as we set it up with an empty memtable and a new L0 file.
|
|
|
|
{
|
|
|
|
"DBImpl::CompactRange:PreRefitLevel",
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
|
|
|
|
"CompactFG",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
|
|
|
|
"CompactedFG",
|
|
|
|
"DBImpl::CompactRange:PostRefitLevel",
|
|
|
|
},
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 1;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG");
|
|
|
|
// Make sure we have something new to compact in the foreground.
|
|
|
|
// Note key 1 is carefully chosen as it ensures the file we create here
|
|
|
|
// overlaps with one of the files being refitted L2->L1 in the background.
|
|
|
|
// If we chose key 0, the file created here would not overlap.
|
|
|
|
ASSERT_OK(Put(Key(1), "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG");
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG");
|
|
|
|
ASSERT_TRUE(dbfull()
|
|
|
|
->CompactRange(CompactRangeOptions(), nullptr, nullptr)
|
|
|
|
.IsIncomplete());
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
|
|
|
|
"CompactedFG");
|
|
|
|
refit_level_thread.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
|
|
|
|
// This test is added to ensure that RefitLevel() error paths are clearing
|
|
|
|
// internal flags and to test that subsequent valid RefitLevel() calls
|
|
|
|
// succeeds
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 3;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
ASSERT_EQ("", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Setup an LSM with three levels populated.
|
|
|
|
Random rnd(301);
|
|
|
|
int key_idx = 0;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
ASSERT_EQ("1", FilesPerLevel(0));
|
|
|
|
{
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 2;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,0,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
auto start_idx = key_idx;
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
GenerateNewFile(&rnd, &key_idx);
|
|
|
|
auto end_idx = key_idx - 1;
|
|
|
|
ASSERT_EQ("1,1,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Next two CompactRange() calls are used to test exercise error paths within
|
|
|
|
// RefitLevel() before triggering a valid RefitLevel() call
|
|
|
|
|
|
|
|
// Trigger a refit to L1 first
|
|
|
|
{
|
|
|
|
std::string begin_string = Key(start_idx);
|
|
|
|
std::string end_string = Key(end_idx);
|
|
|
|
Slice begin(begin_string);
|
|
|
|
Slice end(end_string);
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 1;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,3,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Try a refit from L2->L1 - this should fail and exercise error paths in
|
|
|
|
// RefitLevel()
|
|
|
|
{
|
|
|
|
// Select key range that matches the bottom most level (L2)
|
|
|
|
std::string begin_string = Key(0);
|
|
|
|
std::string end_string = Key(start_idx - 1);
|
|
|
|
Slice begin(begin_string);
|
|
|
|
Slice end(end_string);
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 1;
|
|
|
|
ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,3,2", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Try a valid Refit request to ensure, the path is still working
|
|
|
|
{
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 1;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,5", FilesPerLevel(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithBlob) {
|
|
|
|
Options options;
|
|
|
|
options.env = env_;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
constexpr char first_key[] = "first_key";
|
|
|
|
constexpr char second_key[] = "second_key";
|
|
|
|
constexpr char first_value[] = "first_value";
|
|
|
|
constexpr char second_value[] = "second_value";
|
|
|
|
constexpr char third_value[] = "third_value";
|
|
|
|
|
|
|
|
ASSERT_OK(Put(first_key, first_value));
|
|
|
|
ASSERT_OK(Put(second_key, first_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put(first_key, second_value));
|
|
|
|
ASSERT_OK(Put(second_key, second_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put(first_key, third_value));
|
|
|
|
ASSERT_OK(Put(second_key, third_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
options.enable_blob_files = true;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
constexpr Slice* begin = nullptr;
|
|
|
|
constexpr Slice* end = nullptr;
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
|
|
|
|
|
|
|
|
ASSERT_EQ(Get(first_key), third_value);
|
|
|
|
ASSERT_EQ(Get(second_key), third_value);
|
|
|
|
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
ASSERT_NE(cfd, nullptr);
|
|
|
|
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
ASSERT_NE(current, nullptr);
|
|
|
|
|
|
|
|
const VersionStorageInfo* const storage_info = current->storage_info();
|
|
|
|
ASSERT_NE(storage_info, nullptr);
|
|
|
|
|
|
|
|
const auto& l1_files = storage_info->LevelFiles(1);
|
|
|
|
ASSERT_EQ(l1_files.size(), 1);
|
|
|
|
|
|
|
|
const FileMetaData* const table_file = l1_files[0];
|
|
|
|
ASSERT_NE(table_file, nullptr);
|
|
|
|
|
|
|
|
const auto& blob_files = storage_info->GetBlobFiles();
|
|
|
|
ASSERT_EQ(blob_files.size(), 1);
|
|
|
|
|
Use a sorted vector instead of a map to store blob file metadata (#9526)
Summary:
The patch replaces `std::map` with a sorted `std::vector` for
`VersionStorageInfo::blob_files_` and preallocates the space
for the `vector` before saving the `BlobFileMetaData` into the
new `VersionStorageInfo` in `VersionBuilder::Rep::SaveBlobFilesTo`.
These changes reduce the time the DB mutex is held while
saving new `Version`s, and using a sorted `vector` also makes
lookups faster thanks to better memory locality.
In addition, the patch introduces helper methods
`VersionStorageInfo::GetBlobFileMetaData` and
`VersionStorageInfo::GetBlobFileMetaDataLB` that can be used by
clients to perform lookups in the `vector`, and does some general
cleanup in the parts of code where blob file metadata are used.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9526
Test Plan:
Ran `make check` and the crash test script for a while.
Performance was tested using a load-optimized benchmark (`fillseq` with vector memtable, no WAL) and small file sizes so that a significant number of files are produced:
```
numactl --interleave=all ./db_bench --benchmarks=fillseq --allow_concurrent_memtable_write=false --level0_file_num_compaction_trigger=4 --level0_slowdown_writes_trigger=20 --level0_stop_writes_trigger=30 --max_background_jobs=8 --max_write_buffer_number=8 --db=/data/ltamasi-dbbench --wal_dir=/data/ltamasi-dbbench --num=800000000 --num_levels=8 --key_size=20 --value_size=400 --block_size=8192 --cache_size=51539607552 --cache_numshardbits=6 --compression_max_dict_bytes=0 --compression_ratio=0.5 --compression_type=lz4 --bytes_per_sync=8388608 --cache_index_and_filter_blocks=1 --cache_high_pri_pool_ratio=0.5 --benchmark_write_rate_limit=0 --write_buffer_size=16777216 --target_file_size_base=16777216 --max_bytes_for_level_base=67108864 --verify_checksum=1 --delete_obsolete_files_period_micros=62914560 --max_bytes_for_level_multiplier=8 --statistics=0 --stats_per_interval=1 --stats_interval_seconds=20 --histogram=1 --memtablerep=skip_list --bloom_bits=10 --open_files=-1 --subcompactions=1 --compaction_style=0 --min_level_to_compress=3 --level_compaction_dynamic_level_bytes=true --pin_l0_filter_and_index_blocks_in_cache=1 --soft_pending_compaction_bytes_limit=167503724544 --hard_pending_compaction_bytes_limit=335007449088 --min_level_to_compress=0 --use_existing_db=0 --sync=0 --threads=1 --memtablerep=vector --allow_concurrent_memtable_write=false --disable_wal=1 --enable_blob_files=1 --blob_file_size=16777216 --min_blob_size=0 --blob_compression_type=lz4 --enable_blob_garbage_collection=1 --seed=<some value>
```
Final statistics before the patch:
```
Cumulative writes: 0 writes, 700M keys, 0 commit groups, 0.0 writes per commit group, ingest: 284.62 GB, 121.27 MB/s
Interval writes: 0 writes, 334K keys, 0 commit groups, 0.0 writes per commit group, ingest: 139.28 MB, 72.46 MB/s
```
With the patch:
```
Cumulative writes: 0 writes, 760M keys, 0 commit groups, 0.0 writes per commit group, ingest: 308.66 GB, 131.52 MB/s
Interval writes: 0 writes, 445K keys, 0 commit groups, 0.0 writes per commit group, ingest: 185.35 MB, 93.15 MB/s
```
Total time to complete the benchmark is 2611 seconds with the patch, down from 2986 secs.
Reviewed By: riversand963
Differential Revision: D34082728
Pulled By: ltamasi
fbshipit-source-id: fc598abf676dce436734d06bb9d2d99a26a004fc
3 years ago
|
|
|
const auto& blob_file = blob_files.front();
|
|
|
|
ASSERT_NE(blob_file, nullptr);
|
|
|
|
|
|
|
|
ASSERT_EQ(table_file->smallest.user_key(), first_key);
|
|
|
|
ASSERT_EQ(table_file->largest.user_key(), second_key);
|
|
|
|
ASSERT_EQ(table_file->oldest_blob_file_number,
|
|
|
|
blob_file->GetBlobFileNumber());
|
|
|
|
|
|
|
|
ASSERT_EQ(blob_file->GetTotalBlobCount(), 2);
|
|
|
|
|
|
|
|
const InternalStats* const internal_stats = cfd->internal_stats();
|
|
|
|
ASSERT_NE(internal_stats, nullptr);
|
|
|
|
|
|
|
|
const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
|
|
|
|
ASSERT_GE(compaction_stats.size(), 2);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize());
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written_blob,
|
|
|
|
blob_file->GetTotalBlobBytes());
|
|
|
|
ASSERT_EQ(compaction_stats[1].num_output_files, 1);
|
|
|
|
ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
class DBCompactionTestBlobError
|
|
|
|
: public DBCompactionTest,
|
|
|
|
public testing::WithParamInterface<std::string> {
|
|
|
|
public:
|
Do not explicitly flush blob files when using the integrated BlobDB (#7892)
Summary:
In the original stacked BlobDB implementation, which writes blobs to blob files
immediately and treats blob files as logs, it makes sense to flush the file after
writing each blob to protect against process crashes; however, in the integrated
implementation, which builds blob files in the background jobs, this unnecessarily
reduces performance. This patch fixes this by simply adding a `do_flush` flag to
`BlobLogWriter`, which is set to `true` by the stacked implementation and to `false`
by the new code. Note: the change itself is trivial but the tests needed some work;
since in the new implementation, blobs are now buffered, adding a blob to
`BlobFileBuilder` is no longer guaranteed to result in an actual I/O. Therefore, we can
no longer rely on `FaultInjectionTestEnv` when testing failure cases; instead, we
manipulate the return values of I/O methods directly using `SyncPoint`s.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7892
Test Plan: `make check`
Reviewed By: jay-zhuang
Differential Revision: D26022814
Pulled By: ltamasi
fbshipit-source-id: b3dce419f312137fa70d84cdd9b908fd5d60d8cd
4 years ago
|
|
|
DBCompactionTestBlobError() : sync_point_(GetParam()) {}
|
|
|
|
|
|
|
|
std::string sync_point_;
|
|
|
|
};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError,
|
|
|
|
::testing::ValuesIn(std::vector<std::string>{
|
|
|
|
"BlobFileBuilder::WriteBlobToFile:AddRecord",
|
|
|
|
"BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestBlobError, CompactionError) {
|
|
|
|
Options options;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.env = env_;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
constexpr char first_key[] = "first_key";
|
|
|
|
constexpr char second_key[] = "second_key";
|
|
|
|
constexpr char first_value[] = "first_value";
|
|
|
|
constexpr char second_value[] = "second_value";
|
|
|
|
constexpr char third_value[] = "third_value";
|
|
|
|
|
|
|
|
ASSERT_OK(Put(first_key, first_value));
|
|
|
|
ASSERT_OK(Put(second_key, first_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put(first_key, second_value));
|
|
|
|
ASSERT_OK(Put(second_key, second_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put(first_key, third_value));
|
|
|
|
ASSERT_OK(Put(second_key, third_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
options.enable_blob_files = true;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
Do not explicitly flush blob files when using the integrated BlobDB (#7892)
Summary:
In the original stacked BlobDB implementation, which writes blobs to blob files
immediately and treats blob files as logs, it makes sense to flush the file after
writing each blob to protect against process crashes; however, in the integrated
implementation, which builds blob files in the background jobs, this unnecessarily
reduces performance. This patch fixes this by simply adding a `do_flush` flag to
`BlobLogWriter`, which is set to `true` by the stacked implementation and to `false`
by the new code. Note: the change itself is trivial but the tests needed some work;
since in the new implementation, blobs are now buffered, adding a blob to
`BlobFileBuilder` is no longer guaranteed to result in an actual I/O. Therefore, we can
no longer rely on `FaultInjectionTestEnv` when testing failure cases; instead, we
manipulate the return values of I/O methods directly using `SyncPoint`s.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7892
Test Plan: `make check`
Reviewed By: jay-zhuang
Differential Revision: D26022814
Pulled By: ltamasi
fbshipit-source-id: b3dce419f312137fa70d84cdd9b908fd5d60d8cd
4 years ago
|
|
|
SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
|
|
|
|
Status* const s = static_cast<Status*>(arg);
|
|
|
|
assert(s);
|
|
|
|
|
|
|
|
(*s) = Status::IOError(sync_point_);
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
constexpr Slice* begin = nullptr;
|
|
|
|
constexpr Slice* end = nullptr;
|
|
|
|
|
|
|
|
ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
ASSERT_NE(cfd, nullptr);
|
|
|
|
|
|
|
|
Version* const current = cfd->current();
|
|
|
|
ASSERT_NE(current, nullptr);
|
|
|
|
|
|
|
|
const VersionStorageInfo* const storage_info = current->storage_info();
|
|
|
|
ASSERT_NE(storage_info, nullptr);
|
|
|
|
|
|
|
|
const auto& l1_files = storage_info->LevelFiles(1);
|
|
|
|
ASSERT_TRUE(l1_files.empty());
|
|
|
|
|
|
|
|
const auto& blob_files = storage_info->GetBlobFiles();
|
|
|
|
ASSERT_TRUE(blob_files.empty());
|
|
|
|
|
|
|
|
const InternalStats* const internal_stats = cfd->internal_stats();
|
|
|
|
ASSERT_NE(internal_stats, nullptr);
|
|
|
|
|
|
|
|
const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
|
|
|
|
ASSERT_GE(compaction_stats.size(), 2);
|
|
|
|
|
|
|
|
if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].num_output_files, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
|
|
|
|
} else {
|
|
|
|
// SST file writing succeeded; blob file writing failed (during Finish)
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
|
|
|
|
ASSERT_GT(compaction_stats[1].bytes_written, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].num_output_files, 1);
|
|
|
|
ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
class DBCompactionTestBlobGC
|
|
|
|
: public DBCompactionTest,
|
|
|
|
public testing::WithParamInterface<std::tuple<double, bool>> {
|
|
|
|
public:
|
|
|
|
DBCompactionTestBlobGC()
|
|
|
|
: blob_gc_age_cutoff_(std::get<0>(GetParam())),
|
|
|
|
updated_enable_blob_files_(std::get<1>(GetParam())) {}
|
|
|
|
|
|
|
|
double blob_gc_age_cutoff_;
|
|
|
|
bool updated_enable_blob_files_;
|
|
|
|
};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC,
|
|
|
|
::testing::Combine(::testing::Values(0.0, 0.5, 1.0),
|
|
|
|
::testing::Bool()));
|
|
|
|
|
|
|
|
TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGCOverrides) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.enable_blob_files = true;
|
|
|
|
options.blob_file_size = 32; // one blob per file
|
|
|
|
options.enable_blob_garbage_collection = true;
|
|
|
|
options.blob_garbage_collection_age_cutoff = 0;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 2) {
|
|
|
|
ASSERT_OK(Put("key" + std::to_string(i), "value" + std::to_string(i)));
|
|
|
|
ASSERT_OK(
|
|
|
|
Put("key" + std::to_string(i + 1), "value" + std::to_string(i + 1)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
|
|
|
|
ASSERT_EQ(original_blob_files.size(), 128);
|
|
|
|
|
|
|
|
// Note: turning off enable_blob_files before the compaction results in
|
|
|
|
// garbage collected values getting inlined.
|
|
|
|
ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kForce;
|
|
|
|
cro.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
|
|
|
|
// Check that the GC stats are correct
|
|
|
|
{
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
assert(versions->GetColumnFamilySet());
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
|
|
|
|
const InternalStats* const internal_stats = cfd->internal_stats();
|
|
|
|
assert(internal_stats);
|
|
|
|
|
|
|
|
const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
|
|
|
|
ASSERT_GE(compaction_stats.size(), 2);
|
|
|
|
|
|
|
|
ASSERT_GE(compaction_stats[1].bytes_read_blob, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
const size_t cutoff_index = static_cast<size_t>(
|
|
|
|
cro.blob_garbage_collection_age_cutoff * original_blob_files.size());
|
|
|
|
const size_t expected_num_files = original_blob_files.size() - cutoff_index;
|
|
|
|
|
|
|
|
const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
|
|
|
|
|
|
|
|
ASSERT_EQ(new_blob_files.size(), expected_num_files);
|
|
|
|
|
|
|
|
// Original blob files below the cutoff should be gone, original blob files
|
|
|
|
// at or above the cutoff should be still there
|
|
|
|
for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
|
|
|
|
ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < 128; ++i) {
|
|
|
|
ASSERT_EQ(Get("key" + std::to_string(i)), "value" + std::to_string(i));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) {
|
|
|
|
Options options;
|
|
|
|
options.env = env_;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.enable_blob_files = true;
|
|
|
|
options.blob_file_size = 32; // one blob per file
|
|
|
|
options.enable_blob_garbage_collection = true;
|
|
|
|
options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
constexpr char first_key[] = "first_key";
|
|
|
|
constexpr char first_value[] = "first_value";
|
|
|
|
constexpr char second_key[] = "second_key";
|
|
|
|
constexpr char second_value[] = "second_value";
|
|
|
|
|
|
|
|
ASSERT_OK(Put(first_key, first_value));
|
|
|
|
ASSERT_OK(Put(second_key, second_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
constexpr char third_key[] = "third_key";
|
|
|
|
constexpr char third_value[] = "third_value";
|
|
|
|
constexpr char fourth_key[] = "fourth_key";
|
|
|
|
constexpr char fourth_value[] = "fourth_value";
|
|
|
|
|
|
|
|
ASSERT_OK(Put(third_key, third_value));
|
|
|
|
ASSERT_OK(Put(fourth_key, fourth_value));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
|
|
|
|
|
|
|
|
ASSERT_EQ(original_blob_files.size(), 4);
|
|
|
|
|
|
|
|
const size_t cutoff_index = static_cast<size_t>(
|
|
|
|
options.blob_garbage_collection_age_cutoff * original_blob_files.size());
|
|
|
|
|
|
|
|
// Note: turning off enable_blob_files before the compaction results in
|
|
|
|
// garbage collected values getting inlined.
|
|
|
|
size_t expected_number_of_files = original_blob_files.size();
|
|
|
|
|
|
|
|
if (!updated_enable_blob_files_) {
|
|
|
|
ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
|
|
|
|
|
|
|
|
expected_number_of_files -= cutoff_index;
|
|
|
|
}
|
|
|
|
|
|
|
|
constexpr Slice* begin = nullptr;
|
|
|
|
constexpr Slice* end = nullptr;
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
|
|
|
|
|
|
|
|
ASSERT_EQ(Get(first_key), first_value);
|
|
|
|
ASSERT_EQ(Get(second_key), second_value);
|
|
|
|
ASSERT_EQ(Get(third_key), third_value);
|
|
|
|
ASSERT_EQ(Get(fourth_key), fourth_value);
|
|
|
|
|
|
|
|
const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
|
|
|
|
|
|
|
|
ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
|
|
|
|
|
|
|
|
// Original blob files below the cutoff should be gone, original blob files at
|
|
|
|
// or above the cutoff should be still there
|
|
|
|
for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
|
|
|
|
ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
VersionSet* const versions = dbfull()->GetVersionSet();
|
|
|
|
assert(versions);
|
|
|
|
assert(versions->GetColumnFamilySet());
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
|
|
|
|
const InternalStats* const internal_stats = cfd->internal_stats();
|
|
|
|
assert(internal_stats);
|
|
|
|
|
|
|
|
const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
|
|
|
|
ASSERT_GE(compaction_stats.size(), 2);
|
|
|
|
|
|
|
|
if (blob_gc_age_cutoff_ > 0.0) {
|
|
|
|
ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
|
|
|
|
|
|
|
|
if (updated_enable_blob_files_) {
|
|
|
|
// GC relocated some blobs to new blob files
|
|
|
|
ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_read_blob,
|
|
|
|
compaction_stats[1].bytes_written_blob);
|
|
|
|
} else {
|
|
|
|
// GC moved some blobs back to the LSM, no new blob files
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
|
|
|
|
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
|
|
|
|
}
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) {
|
|
|
|
Options options;
|
|
|
|
options.env = env_;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.enable_blob_files = true;
|
|
|
|
options.enable_blob_garbage_collection = true;
|
|
|
|
options.blob_garbage_collection_age_cutoff = 1.0;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
constexpr char first_key[] = "first_key";
|
|
|
|
constexpr char first_value[] = "first_value";
|
|
|
|
ASSERT_OK(Put(first_key, first_value));
|
|
|
|
|
|
|
|
constexpr char second_key[] = "second_key";
|
|
|
|
constexpr char second_value[] = "second_value";
|
|
|
|
ASSERT_OK(Put(second_key, second_value));
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
constexpr char third_key[] = "third_key";
|
|
|
|
constexpr char third_value[] = "third_value";
|
|
|
|
ASSERT_OK(Put(third_key, third_value));
|
|
|
|
|
|
|
|
constexpr char fourth_key[] = "fourth_key";
|
|
|
|
constexpr char fourth_value[] = "fourth_value";
|
|
|
|
ASSERT_OK(Put(fourth_key, fourth_value));
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
|
|
|
|
[](void* arg) {
|
|
|
|
Slice* const blob_index = static_cast<Slice*>(arg);
|
|
|
|
assert(blob_index);
|
|
|
|
assert(!blob_index->empty());
|
|
|
|
blob_index->remove_prefix(1);
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
constexpr Slice* begin = nullptr;
|
|
|
|
constexpr Slice* end = nullptr;
|
|
|
|
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) {
|
|
|
|
constexpr uint64_t min_blob_size = 10;
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.env = env_;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.enable_blob_files = true;
|
|
|
|
options.min_blob_size = min_blob_size;
|
|
|
|
options.enable_blob_garbage_collection = true;
|
|
|
|
options.blob_garbage_collection_age_cutoff = 1.0;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
constexpr char first_key[] = "first_key";
|
|
|
|
constexpr char first_value[] = "first_value";
|
|
|
|
ASSERT_OK(Put(first_key, first_value));
|
|
|
|
|
|
|
|
constexpr char second_key[] = "second_key";
|
|
|
|
constexpr char second_value[] = "second_value";
|
|
|
|
ASSERT_OK(Put(second_key, second_value));
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
constexpr char third_key[] = "third_key";
|
|
|
|
constexpr char third_value[] = "third_value";
|
|
|
|
ASSERT_OK(Put(third_key, third_value));
|
|
|
|
|
|
|
|
constexpr char fourth_key[] = "fourth_key";
|
|
|
|
constexpr char blob[] = "short";
|
|
|
|
static_assert(sizeof(short) - 1 < min_blob_size,
|
|
|
|
"Blob too long to be inlined");
|
|
|
|
|
|
|
|
// Fake an inlined TTL blob index.
|
|
|
|
std::string blob_index;
|
|
|
|
|
|
|
|
constexpr uint64_t expiration = 1234567890;
|
|
|
|
|
|
|
|
BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
ASSERT_OK(
|
|
|
|
WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
constexpr Slice* begin = nullptr;
|
|
|
|
constexpr Slice* end = nullptr;
|
|
|
|
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) {
|
|
|
|
Options options;
|
|
|
|
options.env = env_;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.enable_blob_files = true;
|
|
|
|
options.enable_blob_garbage_collection = true;
|
|
|
|
options.blob_garbage_collection_age_cutoff = 1.0;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
constexpr char first_key[] = "first_key";
|
|
|
|
constexpr char first_value[] = "first_value";
|
|
|
|
ASSERT_OK(Put(first_key, first_value));
|
|
|
|
|
|
|
|
constexpr char second_key[] = "second_key";
|
|
|
|
constexpr char second_value[] = "second_value";
|
|
|
|
ASSERT_OK(Put(second_key, second_value));
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
constexpr char third_key[] = "third_key";
|
|
|
|
constexpr char third_value[] = "third_value";
|
|
|
|
ASSERT_OK(Put(third_key, third_value));
|
|
|
|
|
|
|
|
constexpr char fourth_key[] = "fourth_key";
|
|
|
|
|
|
|
|
// Fake a blob index referencing a non-existent blob file.
|
|
|
|
std::string blob_index;
|
|
|
|
|
|
|
|
constexpr uint64_t blob_file_number = 1000;
|
|
|
|
constexpr uint64_t offset = 1234;
|
|
|
|
constexpr uint64_t size = 5678;
|
|
|
|
|
|
|
|
BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
|
|
|
|
kNoCompression);
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
ASSERT_OK(
|
|
|
|
WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
constexpr Slice* begin = nullptr;
|
|
|
|
constexpr Slice* end = nullptr;
|
|
|
|
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) {
|
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
std::shared_ptr<FaultInjectionTestFS> fault_fs(
|
|
|
|
new FaultInjectionTestFS(FileSystem::Default()));
|
|
|
|
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.env = fault_fs_env.get();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.checksum_handoff_file_types.Add(FileType::kTableFile);
|
|
|
|
Status s;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
Destroy(options);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// The hash does not match, compaction write fails
|
|
|
|
// fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
|
|
|
|
// Since the file system returns IOStatus::Corruption, it is an
|
|
|
|
// unrecoverable error.
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:FlushMemTableFinished",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BackgroundCallCompaction:0", [&](void*) {
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s.severity(),
|
|
|
|
ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
Destroy(options);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// The file system does not support checksum handoff. The check
|
|
|
|
// will be ignored.
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
|
|
|
|
// Each write will be similated as corrupted.
|
|
|
|
// Since the file system returns IOStatus::Corruption, it is an
|
|
|
|
// unrecoverable error.
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:FlushMemTableFinished",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BackgroundCallCompaction:0",
|
|
|
|
[&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s.severity(),
|
|
|
|
ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) {
|
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
std::shared_ptr<FaultInjectionTestFS> fault_fs(
|
|
|
|
new FaultInjectionTestFS(FileSystem::Default()));
|
|
|
|
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.env = fault_fs_env.get();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
Status s;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
Destroy(options);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// options is not set, the checksum handoff will not be triggered
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:FlushMemTableFinished",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BackgroundCallCompaction:0", [&](void*) {
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
Destroy(options);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// The file system does not support checksum handoff. The check
|
|
|
|
// will be ignored.
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
|
|
|
|
// options is not set, the checksum handoff will not be triggered
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:FlushMemTableFinished",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BackgroundCallCompaction:0",
|
|
|
|
[&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) {
|
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
std::shared_ptr<FaultInjectionTestFS> fault_fs(
|
|
|
|
new FaultInjectionTestFS(FileSystem::Default()));
|
|
|
|
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.env = fault_fs_env.get();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
|
|
|
|
Status s;
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
Destroy(options);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// The hash does not match, compaction write fails
|
|
|
|
// fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
|
|
|
|
// Since the file system returns IOStatus::Corruption, it is mapped to
|
|
|
|
// kFatalError error.
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:FlushMemTableFinished",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BackgroundCallCompaction:0", [&](void*) {
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) {
|
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
std::shared_ptr<FaultInjectionTestFS> fault_fs(
|
|
|
|
new FaultInjectionTestFS(FileSystem::Default()));
|
|
|
|
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.num_levels = 3;
|
|
|
|
options.env = fault_fs_env.get();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
|
|
|
|
Status s;
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// The file system does not support checksum handoff. The check
|
|
|
|
// will be ignored.
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
|
|
|
|
// Each write will be similated as corrupted.
|
|
|
|
// Since the file system returns IOStatus::Corruption, it is mapped to
|
|
|
|
// kFatalError error.
|
|
|
|
fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::FlushMemTable:FlushMemTableFinished",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BackgroundCallCompaction:0",
|
|
|
|
[&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(Put(Key(1), "value3"));
|
|
|
|
s = Flush();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
|
|
|
s = dbfull()->TEST_WaitForCompact();
|
|
|
|
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, FIFOWarm) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.compaction_style = kCompactionStyleFIFO;
|
|
|
|
options.num_levels = 1;
|
|
|
|
options.max_open_files = -1;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
CompactionOptionsFIFO fifo_options;
|
|
|
|
fifo_options.age_for_warm = 1000;
|
|
|
|
fifo_options.max_table_files_size = 100000000;
|
|
|
|
options.compaction_options_fifo = fifo_options;
|
|
|
|
env_->SetMockSleep();
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
int total_warm = 0;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"NewWritableFile::FileOptions.temperature", [&](void* arg) {
|
|
|
|
Temperature temperature = *(static_cast<Temperature*>(arg));
|
|
|
|
if (temperature == Temperature::kWarm) {
|
|
|
|
total_warm++;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// The file system does not support checksum handoff. The check
|
|
|
|
// will be ignored.
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
env_->MockSleepForSeconds(800);
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
env_->MockSleepForSeconds(800);
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
env_->MockSleepForSeconds(800);
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ASSERT_OK(Put(Key(0), "value1"));
|
|
|
|
env_->MockSleepForSeconds(800);
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
ColumnFamilyMetaData metadata;
|
|
|
|
db_->GetColumnFamilyMetaData(&metadata);
|
|
|
|
ASSERT_EQ(4, metadata.file_count);
|
|
|
|
ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
|
|
|
|
ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature);
|
|
|
|
ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature);
|
|
|
|
ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature);
|
|
|
|
ASSERT_EQ(2, total_warm);
|
|
|
|
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
|
|
|
|
const int kNumL0Files = 10;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Generate 2 levels of file to make sure the manual compaction is not skipped
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
ASSERT_OK(Put(Key(i), "value"));
|
|
|
|
if (i % 2) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
ASSERT_OK(Put(Key(i), "value"));
|
|
|
|
if (i % 2) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
// Block compaction queue
|
|
|
|
test::SleepingBackgroundTask sleeping_task_low;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
|
|
|
|
port::Thread compact_thread1([&]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = false;
|
|
|
|
std::string begin_str = Key(0);
|
|
|
|
std::string end_str = Key(3);
|
|
|
|
Slice b = begin_str;
|
|
|
|
Slice e = end_str;
|
|
|
|
auto s = db_->CompactRange(cro, &b, &e);
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
});
|
|
|
|
|
|
|
|
port::Thread compact_thread2([&]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = false;
|
|
|
|
std::string begin_str = Key(4);
|
|
|
|
std::string end_str = Key(7);
|
|
|
|
Slice b = begin_str;
|
|
|
|
Slice e = end_str;
|
|
|
|
auto s = db_->CompactRange(cro, &b, &e);
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
});
|
|
|
|
|
|
|
|
// Disable manual compaction should cancel both manual compactions and both
|
|
|
|
// compaction should return incomplete.
|
|
|
|
db_->DisableManualCompaction();
|
|
|
|
|
|
|
|
compact_thread1.join();
|
|
|
|
compact_thread2.join();
|
|
|
|
|
|
|
|
sleeping_task_low.WakeUp();
|
|
|
|
sleeping_task_low.WaitUntilDone();
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// generate files, but avoid trigger auto compaction
|
|
|
|
for (int i = 0; i < kNumL0Files / 2; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
// make sure the manual compaction background is started but not yet set the
|
|
|
|
// status to in_progress, then cancel the manual compaction, which should not
|
|
|
|
// result in segfault
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::BGWorkCompaction",
|
|
|
|
"DBCompactionTest::DisableJustStartedManualCompaction:"
|
|
|
|
"PreDisableManualCompaction"},
|
|
|
|
{"DBImpl::RunManualCompaction:Unscheduled",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
port::Thread compact_thread([&]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = true;
|
|
|
|
auto s = db_->CompactRange(cro, nullptr, nullptr);
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
});
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::DisableJustStartedManualCompaction:"
|
|
|
|
"PreDisableManualCompaction");
|
|
|
|
db_->DisableManualCompaction();
|
|
|
|
|
|
|
|
compact_thread.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::BackgroundCompaction:InProgress",
|
|
|
|
"DBCompactionTest::DisableInProgressManualCompaction:"
|
|
|
|
"PreDisableManualCompaction"},
|
|
|
|
{"DBImpl::RunManualCompaction:Unscheduled",
|
|
|
|
"CompactionJob::Run():Start"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// generate files, but avoid trigger auto compaction
|
|
|
|
for (int i = 0; i < kNumL0Files / 2; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
port::Thread compact_thread([&]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = true;
|
|
|
|
auto s = db_->CompactRange(cro, nullptr, nullptr);
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::DisableInProgressManualCompaction:"
|
|
|
|
"PreDisableManualCompaction");
|
|
|
|
db_->DisableManualCompaction();
|
|
|
|
|
|
|
|
compact_thread.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::RunManualCompaction:Scheduled",
|
|
|
|
"DBCompactionTest::DisableManualCompactionThreadQueueFull:"
|
|
|
|
"PreDisableManualCompaction"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Block compaction queue
|
|
|
|
test::SleepingBackgroundTask sleeping_task_low;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
|
|
|
|
// generate files, but avoid trigger auto compaction
|
|
|
|
for (int i = 0; i < kNumL0Files / 2; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
port::Thread compact_thread([&]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = true;
|
|
|
|
auto s = db_->CompactRange(cro, nullptr, nullptr);
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::DisableManualCompactionThreadQueueFull:"
|
|
|
|
"PreDisableManualCompaction");
|
|
|
|
|
|
|
|
// Generate more files to trigger auto compaction which is scheduled after
|
|
|
|
// manual compaction. Has to generate 4 more files because existing files are
|
|
|
|
// pending compaction
|
|
|
|
for (int i = 0; i < kNumL0Files; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
|
|
|
|
|
|
|
|
db_->DisableManualCompaction();
|
|
|
|
|
|
|
|
// CompactRange should return before the compaction has the chance to run
|
|
|
|
compact_thread.join();
|
|
|
|
|
|
|
|
sleeping_task_low.WakeUp();
|
|
|
|
sleeping_task_low.WaitUntilDone();
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) {
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::RunManualCompaction:Scheduled",
|
|
|
|
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
|
|
|
|
"PreDisableManualCompaction"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Block compaction queue
|
|
|
|
test::SleepingBackgroundTask sleeping_task_low;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
|
|
|
|
// generate files, but avoid trigger auto compaction
|
|
|
|
for (int i = 0; i < kNumL0Files / 2; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
port::Thread compact_thread([&]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = true;
|
|
|
|
auto s = db_->CompactRange(cro, nullptr, nullptr);
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
|
|
|
|
"PreDisableManualCompaction");
|
|
|
|
|
|
|
|
// Generate more files to trigger auto compaction which is scheduled after
|
|
|
|
// manual compaction. Has to generate 4 more files because existing files are
|
|
|
|
// pending compaction
|
|
|
|
for (int i = 0; i < kNumL0Files; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
|
|
|
|
|
|
|
|
db_->DisableManualCompaction();
|
|
|
|
|
|
|
|
// CompactRange should return before the compaction has the chance to run
|
|
|
|
compact_thread.join();
|
|
|
|
|
|
|
|
// Try close DB while manual compaction is canceled but still in the queue.
|
|
|
|
// And an auto-triggered compaction is also in the queue.
|
|
|
|
auto s = db_->Close();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
sleeping_task_low.WakeUp();
|
|
|
|
sleeping_task_low.WaitUntilDone();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, DBCloseWithManualCompaction) {
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::RunManualCompaction:Scheduled",
|
|
|
|
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
|
|
|
|
"PreDisableManualCompaction"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Block compaction queue
|
|
|
|
test::SleepingBackgroundTask sleeping_task_low;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
|
|
|
|
// generate files, but avoid trigger auto compaction
|
|
|
|
for (int i = 0; i < kNumL0Files / 2; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
port::Thread compact_thread([&]() {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = true;
|
|
|
|
auto s = db_->CompactRange(cro, nullptr, nullptr);
|
|
|
|
ASSERT_TRUE(s.IsIncomplete());
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
|
|
|
|
"PreDisableManualCompaction");
|
|
|
|
|
|
|
|
// Generate more files to trigger auto compaction which is scheduled after
|
|
|
|
// manual compaction. Has to generate 4 more files because existing files are
|
|
|
|
// pending compaction
|
|
|
|
for (int i = 0; i < kNumL0Files; i++) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
|
|
|
|
|
|
|
|
// Close DB with manual compaction and auto triggered compaction in the queue.
|
|
|
|
auto s = db_->Close();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// manual compaction thread should return with Incomplete().
|
|
|
|
compact_thread.join();
|
|
|
|
|
|
|
|
sleeping_task_low.WakeUp();
|
|
|
|
sleeping_task_low.WaitUntilDone();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest,
|
|
|
|
DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) {
|
|
|
|
// When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait
|
|
|
|
// for automatic compactions to drain before starting the manual compaction.
|
|
|
|
// This test verifies `DisableManualCompaction()` can cancel such a compaction
|
|
|
|
// without waiting for the drain to complete.
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
|
|
|
|
// Enforces manual compaction enters wait loop due to pending automatic
|
|
|
|
// compaction.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"},
|
|
|
|
{"DBImpl::RunManualCompaction:WaitScheduled",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
// The automatic compaction will cancel the waiting manual compaction.
|
|
|
|
// Completing this implies the cancellation did not wait on automatic
|
|
|
|
// compactions to finish.
|
|
|
|
bool callback_completed = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BackgroundCallCompaction:0", [&](void* /*arg*/) {
|
|
|
|
db_->DisableManualCompaction();
|
|
|
|
callback_completed = true;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < kNumL0Files; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(1), "value1"));
|
|
|
|
ASSERT_OK(Put(Key(2), "value2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.exclusive_manual_compaction = true;
|
|
|
|
ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete());
|
|
|
|
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
ASSERT_TRUE(callback_completed);
|
|
|
|
}
|
|
|
|
|
Prevent corruption with parallel manual compactions and `change_level == true` (#9077)
Summary:
The bug can impact the following scenario. There must be two `CompactRange()`s, call them A and B. Compaction A must have `change_level=true`. Compactions A and B must run in parallel, and new data must be added while they run as well.
Now, on to the details of the race condition. Compaction A must reach the refitting phase while B's next step is to trivial move new data (i.e., data that has been inserted behind A) down to the same level that A's refit targets (`CompactRangeOptions::target_level`). B must be unregistered (i.e., has not yet called `AddManualCompaction()` for the current `RunManualCompaction()`) while A invokes `DisableManualCompaction()`s to prepare for refitting. In the old code, B could still proceed to register a manual compaction, while A had disabled manual compaction.
The next part of the race condition is B picks and schedules a trivial move while A has released the lock in refitting phase in order to persist the LSM state change (i.e., the log phase of `LogAndApply()`). That way, B does not see the refitted data when picking a trivial-move compaction. So it is susceptible to picking one that overlaps.
Finally, B executes the picked trivial-move compaction. Trivial-move compactions are special in that they never check whether manual compaction is disabled. So the picked compaction causing overlap ends up being applied, leading to LSM corruption if `force_consistency_checks=false`, or entering read-only mode with `Status::Corruption` if `force_consistency_checks=true` (the default).
The fix is just to prevent B from registering itself in `RunManualCompaction()` while manual compactions are disabled, consequently preventing any trivial move or other compaction from being picked/scheduled.
Thanks to siying for finding the bug.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9077
Test Plan: The test does not go all the way in exposing the bug because it requires a compaction to be picked/scheduled while logging LSM state change for RefitLevel(). But the fix is to make such a compaction not picked/scheduled in the first place, so any repro of that scenario would end up hanging RefitLevel() logging. So instead I just verified no such compaction is registered in the scenario where `RefitLevel()` disables manual compactions.
Reviewed By: siying
Differential Revision: D31921908
Pulled By: ajkr
fbshipit-source-id: 9bb5d0e847ad428211227f40830c685c209fbecb
3 years ago
|
|
|
TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.num_levels = 3;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Setup an LSM with L2 populated.
|
|
|
|
Random rnd(301);
|
|
|
|
ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
|
|
|
|
ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
|
|
|
|
{
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 2;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
}
|
|
|
|
ASSERT_EQ("0,0,1", FilesPerLevel(0));
|
|
|
|
|
|
|
|
// The background thread will refit L2->L1 while the foreground thread will
|
|
|
|
// attempt to run a compaction on new data. The following dependencies
|
|
|
|
// ensure the background manual compaction's refitting phase disables manual
|
|
|
|
// compaction immediately before the foreground manual compaction can register
|
|
|
|
// itself. Manual compaction is kept disabled until the foreground manual
|
|
|
|
// checks for the failure once.
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
// Only do Put()s for foreground CompactRange() once the background
|
|
|
|
// CompactRange() has reached the refitting phase.
|
|
|
|
{
|
|
|
|
"DBImpl::CompactRange:BeforeRefit:1",
|
|
|
|
"DBCompactionTest::ChangeLevelConflictsWithManual:"
|
|
|
|
"PreForegroundCompactRange",
|
|
|
|
},
|
|
|
|
// Right before we register the manual compaction, proceed with
|
|
|
|
// the refitting phase so manual compactions are disabled. Stay in
|
|
|
|
// the refitting phase with manual compactions disabled until it is
|
|
|
|
// noticed.
|
|
|
|
{
|
|
|
|
"DBImpl::RunManualCompaction:0",
|
|
|
|
"DBImpl::CompactRange:BeforeRefit:2",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"DBImpl::CompactRange:PreRefitLevel",
|
|
|
|
"DBImpl::RunManualCompaction:1",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"DBImpl::RunManualCompaction:PausedAtStart",
|
|
|
|
"DBImpl::CompactRange:PostRefitLevel",
|
|
|
|
},
|
|
|
|
// If compaction somehow were scheduled, let's let it run after reenabling
|
|
|
|
// manual compactions. This dependency is not expected to be hit but is
|
|
|
|
// here for speculatively coercing future bugs.
|
|
|
|
{
|
|
|
|
"DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled",
|
|
|
|
"BackgroundCallCompaction:0",
|
|
|
|
},
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.change_level = true;
|
|
|
|
cro.target_level = 1;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
});
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::ChangeLevelConflictsWithManual:"
|
|
|
|
"PreForegroundCompactRange");
|
|
|
|
ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
|
|
|
|
ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
|
|
|
|
ASSERT_TRUE(dbfull()
|
|
|
|
->CompactRange(CompactRangeOptions(), nullptr, nullptr)
|
|
|
|
.IsIncomplete());
|
|
|
|
|
|
|
|
refit_level_thread.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
|
|
|
|
// Flushes several files to trigger compaction while lock is released during
|
|
|
|
// a bottom-pri compaction. Verifies it does not get scheduled to thread pool
|
|
|
|
// because per-DB limit for compaction parallelism is one (default).
|
|
|
|
const int kNumL0Files = 4;
|
|
|
|
const int kNumLevels = 3;
|
|
|
|
|
|
|
|
env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.level0_file_num_compaction_trigger = kNumL0Files;
|
|
|
|
options.num_levels = kNumLevels;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Setup last level to be non-empty since it's a bit unclear whether
|
|
|
|
// compaction to an empty level would be considered "bottommost".
|
|
|
|
ASSERT_OK(Put(Key(0), "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(kNumLevels - 1);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::BGWorkBottomCompaction",
|
|
|
|
"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
|
|
|
|
"PreTriggerCompaction"},
|
|
|
|
{"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
|
|
|
|
"PostTriggerCompaction",
|
|
|
|
"BackgroundCallCompaction:0"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
port::Thread compact_range_thread([&] {
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
|
|
|
cro.exclusive_manual_compaction = false;
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
|
|
|
|
});
|
|
|
|
|
|
|
|
// Sleep in the low-pri thread so any newly scheduled compaction will be
|
|
|
|
// queued. Otherwise it might finish before we check its existence.
|
|
|
|
test::SleepingBackgroundTask sleeping_task_low;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
|
|
|
|
"PreTriggerCompaction");
|
|
|
|
for (int i = 0; i < kNumL0Files; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(0), "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
|
|
|
|
"PostTriggerCompaction");
|
|
|
|
|
|
|
|
sleeping_task_low.WakeUp();
|
|
|
|
sleeping_task_low.WaitUntilDone();
|
|
|
|
compact_range_thread.join();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
|
|
|
|
// allow_ingest_behind prevents seqnum zeroing, and could cause
|
|
|
|
// compaction loop with reason kBottommostFiles.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.compaction_style = kCompactionStyleLevel;
|
|
|
|
options.allow_ingest_behind = true;
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "infinite", "compaction loop"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "infinite", "loop"));
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "bumpseqnum", ""));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
auto snapshot = db_->GetSnapshot();
|
|
|
|
// Bump up oldest_snapshot_seqnum_ in VersionStorageInfo.
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
bool compacted = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
|
|
|
|
// There should not be a compaction.
|
|
|
|
compacted = true;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
// Wait for compaction to be scheduled.
|
|
|
|
env_->SleepForMicroseconds(2000000);
|
|
|
|
ASSERT_FALSE(compacted);
|
|
|
|
// The following assert can be used to check for compaction loop:
|
|
|
|
// it used to wait forever before the fix.
|
|
|
|
// ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // !defined(ROCKSDB_LITE)
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
#if !defined(ROCKSDB_LITE)
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
#else
|
|
|
|
(void)argc;
|
|
|
|
(void)argv;
|
|
|
|
return 0;
|
|
|
|
#endif
|
|
|
|
}
|