|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include <cinttypes>
|
|
|
|
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
|
|
#include "db/error_handler.h"
|
|
|
|
#include "db/event_helpers.h"
|
|
|
|
#include "logging/logging.h"
|
|
|
|
#include "monitoring/perf_context_imp.h"
|
|
|
|
#include "options/options_helper.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "util/cast_util.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Convenience methods
|
|
|
|
Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& val) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
const Status s = FailIfCfHasTs(column_family);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::Put(o, column_family, key, val);
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& ts, const Slice& val) {
|
|
|
|
const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::Put(o, column_family, key, ts, val);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::PutEntity(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const WideColumns& columns) {
|
|
|
|
const Status s = FailIfCfHasTs(column_family);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
return DB::PutEntity(options, column_family, key, columns);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& val) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
const Status s = FailIfCfHasTs(column_family);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
|
|
if (!cfh->cfd()->ioptions()->merge_operator) {
|
|
|
|
return Status::NotSupported("Provide a merge_operator when opening DB");
|
|
|
|
} else {
|
|
|
|
return DB::Merge(o, column_family, key, val);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Delete(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
const Status s = FailIfCfHasTs(column_family);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::Delete(write_options, column_family, key);
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status DBImpl::Delete(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& ts) {
|
|
|
|
const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::Delete(write_options, column_family, key, ts);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::SingleDelete(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
const Status s = FailIfCfHasTs(column_family);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::SingleDelete(write_options, column_family, key);
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status DBImpl::SingleDelete(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& ts) {
|
|
|
|
const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::SingleDelete(write_options, column_family, key, ts);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::DeleteRange(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& begin_key, const Slice& end_key) {
|
|
|
|
const Status s = FailIfCfHasTs(column_family);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::DeleteRange(write_options, column_family, begin_key, end_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::DeleteRange(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& begin_key, const Slice& end_key,
|
|
|
|
const Slice& ts) {
|
|
|
|
const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return DB::DeleteRange(write_options, column_family, begin_key, end_key, ts);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::SetRecoverableStatePreReleaseCallback(
|
|
|
|
PreReleaseCallback* callback) {
|
|
|
|
recoverable_state_pre_release_callback_.reset(callback);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
|
|
|
|
Status s;
|
|
|
|
if (write_options.protection_bytes_per_key > 0) {
|
|
|
|
s = WriteBatchInternal::UpdateProtectionInfo(
|
|
|
|
my_batch, write_options.protection_bytes_per_key);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
|
|
|
|
/*log_used=*/nullptr);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch,
|
|
|
|
WriteCallback* callback) {
|
|
|
|
Status s;
|
|
|
|
if (write_options.protection_bytes_per_key > 0) {
|
|
|
|
s = WriteBatchInternal::UpdateProtectionInfo(
|
|
|
|
my_batch, write_options.protection_bytes_per_key);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = WriteImpl(write_options, my_batch, callback, nullptr);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
|
|
|
// The main write queue. This is the only write queue that updates LastSequence.
|
|
|
|
// When using one write queue, the same sequence also indicates the last
|
|
|
|
// published sequence.
|
|
|
|
Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch, WriteCallback* callback,
|
|
|
|
uint64_t* log_used, uint64_t log_ref,
|
|
|
|
bool disable_memtable, uint64_t* seq_used,
|
|
|
|
size_t batch_cnt,
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
PreReleaseCallback* pre_release_callback,
|
|
|
|
PostMemTableCallback* post_memtable_callback) {
|
|
|
|
assert(!seq_per_batch_ || batch_cnt != 0);
|
|
|
|
assert(my_batch == nullptr || my_batch->Count() == 0 ||
|
|
|
|
write_options.protection_bytes_per_key == 0 ||
|
|
|
|
write_options.protection_bytes_per_key ==
|
|
|
|
my_batch->GetProtectionBytesPerKey());
|
|
|
|
if (my_batch == nullptr) {
|
|
|
|
return Status::InvalidArgument("Batch is nullptr!");
|
Support user-defined timestamps in write-committed txns (#9629)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9629
Pessimistic transactions use pessimistic concurrency control, i.e. locking. Keys are
locked upon first operation that writes the key or has the intention of writing. For example,
`PessimisticTransaction::Put()`, `PessimisticTransaction::Delete()`,
`PessimisticTransaction::SingleDelete()` will write to or delete a key, while
`PessimisticTransaction::GetForUpdate()` is used by application to indicate
to RocksDB that the transaction has the intention of performing write operation later
in the same transaction.
Pessimistic transactions support two-phase commit (2PC). A transaction can be
`Prepared()`'ed and then `Commit()`. The prepare phase is similar to a promise: once
`Prepare()` succeeds, the transaction has acquired the necessary resources to commit.
The resources include locks, persistence of WAL, etc.
Write-committed transaction is the default pessimistic transaction implementation. In
RocksDB write-committed transaction, `Prepare()` will write data to the WAL as a prepare
section. `Commit()` will write a commit marker to the WAL and then write data to the
memtables. While writing to the memtables, different keys in the transaction's write batch
will be assigned different sequence numbers in ascending order.
Until commit/rollback, the transaction holds locks on the keys so that no other transaction
can write to the same keys. Furthermore, the keys' sequence numbers represent the order
in which they are committed and should be made visible. This is convenient for us to
implement support for user-defined timestamps.
Since column families with and without timestamps can co-exist in the same database,
a transaction may or may not involve timestamps. Based on this observation, we add two
optional members to each `PessimisticTransaction`, `read_timestamp_` and
`commit_timestamp_`. If no key in the transaction's write batch has timestamp, then
setting these two variables do not have any effect. For the rest of this commit, we discuss
only the cases when these two variables are meaningful.
read_timestamp_ is used mainly for validation, and should be set before first call to
`GetForUpdate()`. Otherwise, the latter will return non-ok status. `GetForUpdate()` calls
`TryLock()` that can verify if another transaction has written the same key since
`read_timestamp_` till this call to `GetForUpdate()`. If another transaction has indeed
written the same key, then validation fails, and RocksDB allows this transaction to
refine `read_timestamp_` by increasing it. Note that a transaction can still use `Get()`
with a different timestamp to read, but the result of the read should not be used to
determine data that will be written later.
commit_timestamp_ must be set after finishing writing and before transaction commit.
This applies to both 2PC and non-2PC cases. In the case of 2PC, it's usually set after
prepare phase succeeds.
We currently require that the commit timestamp be chosen after all keys are locked. This
means we disallow the `TransactionDB`-level APIs if user-defined timestamp is used
by the transaction. Specifically, calling `PessimisticTransactionDB::Put()`,
`PessimisticTransactionDB::Delete()`, `PessimisticTransactionDB::SingleDelete()`,
etc. will return non-ok status because they specify timestamps before locking the keys.
Users are also prompted to use the `Transaction` APIs when they receive the non-ok status.
Reviewed By: ltamasi
Differential Revision: D31822445
fbshipit-source-id: b82abf8e230216dc89cc519564a588224a88fd43
3 years ago
|
|
|
} else if (!disable_memtable &&
|
|
|
|
WriteBatchInternal::TimestampsUpdateNeeded(*my_batch)) {
|
|
|
|
// If writing to memtable, then we require the caller to set/update the
|
|
|
|
// timestamps for the keys in the write batch.
|
|
|
|
// Otherwise, it means we are just writing to the WAL, and we allow
|
|
|
|
// timestamps unset for the keys in the write batch. This can happen if we
|
|
|
|
// use TransactionDB with write-committed policy, and we currently do not
|
|
|
|
// support user-defined timestamp with other policies.
|
|
|
|
// In the prepare phase, a transaction can write the batch to the WAL
|
|
|
|
// without inserting to memtable. The keys in the batch do not have to be
|
|
|
|
// assigned timestamps because they will be used only during recovery if
|
|
|
|
// there is a commit marker which includes their commit timestamp.
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
return Status::InvalidArgument("write batch must have timestamp(s) set");
|
Rate-limit automatic WAL flush after each user write (#9607)
Summary:
**Context:**
WAL flush is currently not rate-limited by `Options::rate_limiter`. This PR is to provide rate-limiting to auto WAL flush, the one that automatically happen after each user write operation (i.e, `Options::manual_wal_flush == false`), by adding `WriteOptions::rate_limiter_options`.
Note that we are NOT rate-limiting WAL flush that do NOT automatically happen after each user write, such as `Options::manual_wal_flush == true + manual FlushWAL()` (rate-limiting multiple WAL flushes), for the benefits of:
- being consistent with [ReadOptions::rate_limiter_priority](https://github.com/facebook/rocksdb/blob/7.0.fb/include/rocksdb/options.h#L515)
- being able to turn off some WAL flush's rate-limiting but not all (e.g, turn off specific the WAL flush of a critical user write like a service's heartbeat)
`WriteOptions::rate_limiter_options` only accept `Env::IO_USER` and `Env::IO_TOTAL` currently due to an implementation constraint.
- The constraint is that we currently queue parallel writes (including WAL writes) based on FIFO policy which does not factor rate limiter priority into this layer's scheduling. If we allow lower priorities such as `Env::IO_HIGH/MID/LOW` and such writes specified with lower priorities occurs before ones specified with higher priorities (even just by a tiny bit in arrival time), the former would have blocked the latter, leading to a "priority inversion" issue and contradictory to what we promise for rate-limiting priority. Therefore we only allow `Env::IO_USER` and `Env::IO_TOTAL` right now before improving that scheduling.
A pre-requisite to this feature is to support operation-level rate limiting in `WritableFileWriter`, which is also included in this PR.
**Summary:**
- Renamed test suite `DBRateLimiterTest to DBRateLimiterOnReadTest` for adding a new test suite
- Accept `rate_limiter_priority` in `WritableFileWriter`'s private and public write functions
- Passed `WriteOptions::rate_limiter_options` to `WritableFileWriter` in the path of automatic WAL flush.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9607
Test Plan:
- Added new unit test to verify existing flush/compaction rate-limiting does not break, since `DBTest, RateLimitingTest` is disabled and current db-level rate-limiting tests focus on read only (e.g, `db_rate_limiter_test`, `DBTest2, RateLimitedCompactionReads`).
- Added new unit test `DBRateLimiterOnWriteWALTest, AutoWalFlush`
- `strace -ftt -e trace=write ./db_bench -benchmarks=fillseq -db=/dev/shm/testdb -rate_limit_auto_wal_flush=1 -rate_limiter_bytes_per_sec=15 -rate_limiter_refill_period_us=1000000 -write_buffer_size=100000000 -disable_auto_compactions=1 -num=100`
- verified that WAL flush(i.e, system-call _write_) were chunked into 15 bytes and each _write_ was roughly 1 second apart
- verified the chunking disappeared when `-rate_limit_auto_wal_flush=0`
- crash test: `python3 tools/db_crashtest.py blackbox --disable_wal=0 --rate_limit_auto_wal_flush=1 --rate_limiter_bytes_per_sec=10485760 --interval=10` killed as normal
**Benchmarked on flush/compaction to ensure no performance regression:**
- compaction with rate-limiting (see table 1, avg over 1280-run): pre-change: **915635 micros/op**; post-change:
**907350 micros/op (improved by 0.106%)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f compact_bmk_output.txt compact_bmk_output_2.txt dont_care_output.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench --benchmarks=fillrandom -db=$TEST_TMPDIR -disable_auto_compactions=1 -write_buffer_size=6710886 > dont_care_output.txt && ./db_bench --benchmarks=compact -use_existing_db=1 -db=$TEST_TMPDIR -level0_file_num_compaction_trigger=1 -rate_limiter_bytes_per_sec=100000000 | egrep 'compact'
done > compact_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' compact_bmk_output.txt >> compact_bmk_output_2.txt
done
```
- compaction w/o rate-limiting (see table 2, avg over 640-run): pre-change: **822197 micros/op**; post-change: **823148 micros/op (regressed by 0.12%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
- flush with rate-limiting (see table 3, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench ): pre-change: **745752 micros/op**; post-change: **745331 micros/op (regressed by 0.06 %)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f flush_bmk_output.txt flush_bmk_output_2.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench -db=$TEST_TMPDIR -write_buffer_size=1048576000 -num=1000000 -rate_limiter_bytes_per_sec=100000000 -benchmarks=fillseq,flush | egrep 'flush'
done > flush_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' flush_bmk_output.txt >> flush_bmk_output_2.txt
done
```
- flush w/o rate-limiting (see table 4, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench): pre-change: **487512 micros/op**, post-change: **485856 micors/ops (improved by 0.34%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
| table 1 - compact with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 896978 | 16046.9 | 901242 | 15670.9 | 0.475373978
20 | 893718 | 15813 | 886505 | 17544.7 | -0.8070778478
40 | 900426 | 23882.2 | 894958 | 15104.5 | -0.6072681153
80 | 906635 | 21761.5 | 903332 | 23948.3 | -0.3643141948
160 | 898632 | 21098.9 | 907583 | 21145 | 0.9960695813
3.20E+02 | 905252 | 22785.5 | 908106 | 25325.5 | 0.3152713278
6.40E+02 | 905213 | 23598.6 | 906741 | 21370.5 | 0.1688000504
**1.28E+03** | **908316** | **23533.1** | **907350** | **24626.8** | **-0.1063506533**
average over #-run | 901896.25 | 21064.9625 | 901977.125 | 20592.025 | 0.008967217682
| table 2 - compact w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 811211 | 26996.7 | 807586 | 28456.4 | -0.4468627768
20 | 815465 | 14803.7 | 814608 | 28719.7 | -0.105093413
40 | 809203 | 26187.1 | 797835 | 25492.1 | -1.404839082
80 | 822088 | 28765.3 | 822192 | 32840.4 | 0.01265071379
160 | 821719 | 36344.7 | 821664 | 29544.9 | -0.006693285661
3.20E+02 | 820921 | 27756.4 | 821403 | 28347.7 | 0.05871454135
**6.40E+02** | **822197** | **28960.6** | **823148** | **30055.1** | **0.1156657103**
average over #-run | 8.18E+05 | 2.71E+04 | 8.15E+05 | 2.91E+04 | -0.25
| table 3 - flush with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 741721 | 11770.8 | 740345 | 5949.76 | -0.1855144994
20 | 735169 | 3561.83 | 743199 | 9755.77 | 1.09226586
40 | 743368 | 8891.03 | 742102 | 8683.22 | -0.1703059588
80 | 742129 | 8148.51 | 743417 | 9631.58| 0.1735547324
160 | 749045 | 9757.21 | 746256 | 9191.86 | -0.3723407806
**3.20E+02** | **745752** | **9819.65** | **745331** | **9840.62** | **-0.0564530836**
6.40E+02 | 749006 | 11080.5 | 748173 | 10578.7 | -0.1112140624
average over #-run | 743741.4286 | 9004.218571 | 744117.5714 | 9090.215714 | 0.05057441238
| table 4 - flush w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 477283 | 24719.6 | 473864 | 12379 | -0.7163464863
20 | 486743 | 20175.2 | 502296 | 23931.3 | 3.195320734
40 | 482846 | 15309.2 | 489820 | 22259.5 | 1.444352858
80 | 491490 | 21883.1 | 490071 | 23085.7 | -0.2887139108
160 | 493347 | 28074.3 | 483609 | 21211.7 | -1.973864238
**3.20E+02** | **487512** | **21401.5** | **485856** | **22195.2** | **-0.3396839462**
6.40E+02 | 490307 | 25418.6 | 485435 | 22405.2 | -0.9936631539
average over #-run | 4.87E+05 | 2.24E+04 | 4.87E+05 | 2.11E+04 | 0.00E+00
Reviewed By: ajkr
Differential Revision: D34442441
Pulled By: hx235
fbshipit-source-id: 4790f13e1e5c0a95ae1d1cc93ffcf69dc6e78bdd
3 years ago
|
|
|
} else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
|
|
|
|
write_options.rate_limiter_priority != Env::IO_USER) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"WriteOptions::rate_limiter_priority only allows "
|
|
|
|
"Env::IO_TOTAL and Env::IO_USER due to implementation constraints");
|
|
|
|
} else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
|
|
|
|
(write_options.disableWAL || manual_wal_flush_)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"WriteOptions::rate_limiter_priority currently only supports "
|
|
|
|
"rate-limiting automatic WAL flush, which requires "
|
|
|
|
"`WriteOptions::disableWAL` and "
|
|
|
|
"`DBOptions::manual_wal_flush` both set to false");
|
|
|
|
} else if (write_options.protection_bytes_per_key != 0 &&
|
|
|
|
write_options.protection_bytes_per_key != 8) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"`WriteOptions::protection_bytes_per_key` must be zero or eight");
|
|
|
|
}
|
|
|
|
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
|
|
// grabs but does not seem thread-safe.
|
|
|
|
if (tracer_) {
|
|
|
|
InstrumentedMutexLock lock(&trace_mutex_);
|
|
|
|
if (tracer_ && !tracer_->IsWriteOrderPreserved()) {
|
|
|
|
// We don't have to preserve write order so can trace anywhere. It's more
|
|
|
|
// efficient to trace here than to add latency to a phase of the log/apply
|
|
|
|
// pipeline.
|
|
|
|
// TODO: maybe handle the tracing status?
|
|
|
|
tracer_->Write(my_batch).PermitUncheckedError();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (write_options.sync && write_options.disableWAL) {
|
|
|
|
return Status::InvalidArgument("Sync writes has to enable WAL.");
|
|
|
|
}
|
|
|
|
if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"pipelined_writes is not compatible with concurrent prepares");
|
|
|
|
}
|
|
|
|
if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
|
|
|
|
// TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
|
|
|
|
return Status::NotSupported(
|
|
|
|
"pipelined_writes is not compatible with seq_per_batch");
|
|
|
|
}
|
|
|
|
if (immutable_db_options_.unordered_write &&
|
|
|
|
immutable_db_options_.enable_pipelined_write) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"pipelined_writes is not compatible with unordered_write");
|
|
|
|
}
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
if (immutable_db_options_.enable_pipelined_write &&
|
|
|
|
post_memtable_callback != nullptr) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"pipelined write currently does not honor post_memtable_callback");
|
|
|
|
}
|
|
|
|
if (seq_per_batch_ && post_memtable_callback != nullptr) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"seq_per_batch currently does not honor post_memtable_callback");
|
|
|
|
}
|
|
|
|
// Otherwise IsLatestPersistentState optimization does not make sense
|
|
|
|
assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
|
|
|
|
disable_memtable);
|
|
|
|
|
|
|
|
if (write_options.low_pri) {
|
|
|
|
Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (two_write_queues_ && disable_memtable) {
|
|
|
|
AssignOrder assign_order =
|
|
|
|
seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder;
|
|
|
|
// Otherwise it is WAL-only Prepare batches in WriteCommitted policy and
|
|
|
|
// they don't consume sequence.
|
|
|
|
return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch,
|
|
|
|
callback, log_used, log_ref, seq_used, batch_cnt,
|
|
|
|
pre_release_callback, assign_order,
|
|
|
|
kDontPublishLastSeq, disable_memtable);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (immutable_db_options_.unordered_write) {
|
|
|
|
const size_t sub_batch_cnt = batch_cnt != 0
|
|
|
|
? batch_cnt
|
|
|
|
// every key is a sub-batch consuming a seq
|
|
|
|
: WriteBatchInternal::Count(my_batch);
|
|
|
|
uint64_t seq = 0;
|
|
|
|
// Use a write thread to i) optimize for WAL write, ii) publish last
|
|
|
|
// sequence in in increasing order, iii) call pre_release_callback serially
|
|
|
|
Status status = WriteImplWALOnly(
|
|
|
|
&write_thread_, write_options, my_batch, callback, log_used, log_ref,
|
|
|
|
&seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder,
|
|
|
|
kDoPublishLastSeq, disable_memtable);
|
|
|
|
TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
if (seq_used) {
|
|
|
|
*seq_used = seq;
|
|
|
|
}
|
|
|
|
if (!disable_memtable) {
|
|
|
|
TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
|
|
|
|
status = UnorderedWriteMemtable(write_options, my_batch, callback,
|
|
|
|
log_ref, seq, sub_batch_cnt);
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (immutable_db_options_.enable_pipelined_write) {
|
|
|
|
return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
|
|
|
|
log_ref, disable_memtable, seq_used);
|
|
|
|
}
|
|
|
|
|
|
|
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
|
|
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
disable_memtable, batch_cnt, pre_release_callback,
|
|
|
|
post_memtable_callback);
|
|
|
|
StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
|
|
|
|
|
|
|
|
write_thread_.JoinBatchGroup(&w);
|
|
|
|
if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
|
|
|
|
// we are a non-leader in a parallel group
|
|
|
|
|
|
|
|
if (w.ShouldWriteToMemtable()) {
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
PERF_TIMER_GUARD(write_memtable_time);
|
|
|
|
|
|
|
|
ColumnFamilyMemTablesImpl column_family_memtables(
|
|
|
|
versions_->GetColumnFamilySet());
|
|
|
|
w.status = WriteBatchInternal::InsertInto(
|
|
|
|
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
&trim_history_scheduler_,
|
|
|
|
write_options.ignore_missing_column_families, 0 /*log_number*/, this,
|
|
|
|
true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
|
|
|
|
batch_per_txn_, write_options.memtable_insert_hint_per_batch);
|
|
|
|
|
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (write_thread_.CompleteParallelMemTableWriter(&w)) {
|
|
|
|
// we're responsible for exit batch group
|
|
|
|
// TODO(myabandeh): propagate status to write_group
|
|
|
|
auto last_sequence = w.write_group->last_sequence;
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
for (auto* tmp_w : *(w.write_group)) {
|
|
|
|
assert(tmp_w);
|
|
|
|
if (tmp_w->post_memtable_callback) {
|
|
|
|
Status tmp_s =
|
|
|
|
(*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
|
|
|
|
// TODO: propagate the execution status of post_memtable_callback to
|
|
|
|
// caller.
|
|
|
|
assert(tmp_s.ok());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
versions_->SetLastSequence(last_sequence);
|
|
|
|
MemTableInsertStatusCheck(w.status);
|
|
|
|
write_thread_.ExitAsBatchGroupFollower(&w);
|
|
|
|
}
|
|
|
|
assert(w.state == WriteThread::STATE_COMPLETED);
|
|
|
|
// STATE_COMPLETED conditional below handles exit
|
|
|
|
}
|
|
|
|
if (w.state == WriteThread::STATE_COMPLETED) {
|
|
|
|
if (log_used != nullptr) {
|
|
|
|
*log_used = w.log_used;
|
|
|
|
}
|
|
|
|
if (seq_used != nullptr) {
|
|
|
|
*seq_used = w.sequence;
|
|
|
|
}
|
|
|
|
// write is complete and leader has updated sequence
|
|
|
|
return w.FinalStatus();
|
|
|
|
}
|
|
|
|
// else we are the leader of the write batch group
|
|
|
|
assert(w.state == WriteThread::STATE_GROUP_LEADER);
|
|
|
|
Status status;
|
|
|
|
// Once reaches this point, the current writer "w" will try to do its write
|
|
|
|
// job. It may also pick up some of the remaining writers in the "writers_"
|
|
|
|
// when it finds suitable, and finish them in the same write batch.
|
|
|
|
// This is how a write job could be done by the other writer.
|
|
|
|
WriteContext write_context;
|
|
|
|
LogContext log_context(write_options.sync);
|
|
|
|
WriteThread::WriteGroup write_group;
|
|
|
|
bool in_parallel_group = false;
|
|
|
|
uint64_t last_sequence = kMaxSequenceNumber;
|
|
|
|
|
|
|
|
assert(!two_write_queues_ || !disable_memtable);
|
|
|
|
{
|
|
|
|
// With concurrent writes we do preprocess only in the write thread that
|
|
|
|
// also does write to memtable to avoid sync issue on shared data structure
|
|
|
|
// with the other thread
|
|
|
|
|
|
|
|
// PreprocessWrite does its own perf timing.
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
|
|
|
|
status = PreprocessWrite(write_options, &log_context, &write_context);
|
|
|
|
if (!two_write_queues_) {
|
|
|
|
// Assign it after ::PreprocessWrite since the sequence might advance
|
|
|
|
// inside it by WriteRecoverableState
|
|
|
|
last_sequence = versions_->LastSequence();
|
|
|
|
}
|
|
|
|
|
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add to log and apply to memtable. We can release the lock
|
|
|
|
// during this phase since &w is currently responsible for logging
|
|
|
|
// and protects against concurrent loggers and concurrent writes
|
|
|
|
// into memtables
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
|
|
|
|
last_batch_group_size_ =
|
|
|
|
write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
|
|
|
|
|
|
|
|
IOStatus io_s;
|
|
|
|
Status pre_release_cb_status;
|
|
|
|
if (status.ok()) {
|
|
|
|
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
|
|
// grabs but does not seem thread-safe.
|
|
|
|
if (tracer_) {
|
|
|
|
InstrumentedMutexLock lock(&trace_mutex_);
|
|
|
|
if (tracer_ && tracer_->IsWriteOrderPreserved()) {
|
|
|
|
for (auto* writer : write_group) {
|
|
|
|
// TODO: maybe handle the tracing status?
|
|
|
|
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Rules for when we can update the memtable concurrently
|
|
|
|
// 1. supported by memtable
|
|
|
|
// 2. Puts are not okay if inplace_update_support
|
|
|
|
// 3. Merges are not okay
|
|
|
|
//
|
|
|
|
// Rules 1..2 are enforced by checking the options
|
|
|
|
// during startup (CheckConcurrentWritesSupported), so if
|
|
|
|
// options.allow_concurrent_memtable_write is true then they can be
|
|
|
|
// assumed to be true. Rule 3 is checked for each batch. We could
|
|
|
|
// relax rules 2 if we could prevent write batches from referring
|
|
|
|
// more than once to a particular key.
|
|
|
|
bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
|
|
|
|
write_group.size > 1;
|
|
|
|
size_t total_count = 0;
|
|
|
|
size_t valid_batches = 0;
|
|
|
|
size_t total_byte_size = 0;
|
|
|
|
size_t pre_release_callback_cnt = 0;
|
|
|
|
for (auto* writer : write_group) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
assert(writer);
|
|
|
|
if (writer->CheckCallback(this)) {
|
|
|
|
valid_batches += writer->batch_cnt;
|
|
|
|
if (writer->ShouldWriteToMemtable()) {
|
|
|
|
total_count += WriteBatchInternal::Count(writer->batch);
|
|
|
|
parallel = parallel && !writer->batch->HasMerge();
|
|
|
|
}
|
|
|
|
total_byte_size = WriteBatchInternal::AppendedByteSize(
|
|
|
|
total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
|
|
|
|
if (writer->pre_release_callback) {
|
|
|
|
pre_release_callback_cnt++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Note about seq_per_batch_: either disableWAL is set for the entire write
|
|
|
|
// group or not. In either case we inc seq for each write batch with no
|
|
|
|
// failed callback. This means that there could be a batch with
|
|
|
|
// disalbe_memtable in between; although we do not write this batch to
|
|
|
|
// memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
|
|
|
|
// the seq per valid written key to mem.
|
|
|
|
size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
|
|
|
|
|
|
|
|
const bool concurrent_update = two_write_queues_;
|
|
|
|
// Update stats while we are an exclusive group leader, so we know
|
|
|
|
// that nobody else can be writing to these particular stats.
|
|
|
|
// We're optimistic, updating the stats before we successfully
|
|
|
|
// commit. That lets us release our leader status early.
|
|
|
|
auto stats = default_cf_internal_stats_;
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
|
|
|
|
concurrent_update);
|
|
|
|
RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
|
|
|
|
concurrent_update);
|
|
|
|
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
|
|
|
|
concurrent_update);
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_SELF);
|
|
|
|
auto write_done_by_other = write_group.size - 1;
|
|
|
|
if (write_done_by_other > 0) {
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
|
|
|
|
write_done_by_other, concurrent_update);
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
|
|
|
|
}
|
|
|
|
RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
|
|
|
|
|
|
|
|
if (write_options.disableWAL) {
|
|
|
|
has_unpersisted_data_.store(true, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
|
|
|
|
if (!two_write_queues_) {
|
|
|
|
if (status.ok() && !write_options.disableWAL) {
|
|
|
|
assert(log_context.log_file_number_size);
|
|
|
|
LogFileNumberSize& log_file_number_size =
|
|
|
|
*(log_context.log_file_number_size);
|
|
|
|
PERF_TIMER_GUARD(write_wal_time);
|
|
|
|
io_s =
|
|
|
|
WriteToWAL(write_group, log_context.writer, log_used,
|
|
|
|
log_context.need_log_sync, log_context.need_log_dir_sync,
|
|
|
|
last_sequence + 1, log_file_number_size);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (status.ok() && !write_options.disableWAL) {
|
|
|
|
PERF_TIMER_GUARD(write_wal_time);
|
|
|
|
// LastAllocatedSequence is increased inside WriteToWAL under
|
|
|
|
// wal_write_mutex_ to ensure ordered events in WAL
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
|
|
|
|
seq_inc);
|
|
|
|
} else {
|
|
|
|
// Otherwise we inc seq number for memtable writes
|
|
|
|
last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
|
|
|
|
}
|
|
|
|
}
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
status = io_s;
|
|
|
|
assert(last_sequence != kMaxSequenceNumber);
|
|
|
|
const SequenceNumber current_sequence = last_sequence + 1;
|
|
|
|
last_sequence += seq_inc;
|
|
|
|
|
|
|
|
// PreReleaseCallback is called after WAL write and before memtable write
|
|
|
|
if (status.ok()) {
|
|
|
|
SequenceNumber next_sequence = current_sequence;
|
|
|
|
size_t index = 0;
|
|
|
|
// Note: the logic for advancing seq here must be consistent with the
|
|
|
|
// logic in WriteBatchInternal::InsertInto(write_group...) as well as
|
|
|
|
// with WriteBatchInternal::InsertInto(write_batch...) that is called on
|
|
|
|
// the merged batch during recovery from the WAL.
|
|
|
|
for (auto* writer : write_group) {
|
|
|
|
if (writer->CallbackFailed()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
writer->sequence = next_sequence;
|
|
|
|
if (writer->pre_release_callback) {
|
|
|
|
Status ws = writer->pre_release_callback->Callback(
|
|
|
|
writer->sequence, disable_memtable, writer->log_used, index++,
|
|
|
|
pre_release_callback_cnt);
|
|
|
|
if (!ws.ok()) {
|
|
|
|
status = pre_release_cb_status = ws;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (seq_per_batch_) {
|
|
|
|
assert(writer->batch_cnt);
|
|
|
|
next_sequence += writer->batch_cnt;
|
|
|
|
} else if (writer->ShouldWriteToMemtable()) {
|
|
|
|
next_sequence += WriteBatchInternal::Count(writer->batch);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
PERF_TIMER_GUARD(write_memtable_time);
|
|
|
|
|
|
|
|
if (!parallel) {
|
|
|
|
// w.sequence will be set inside InsertInto
|
|
|
|
w.status = WriteBatchInternal::InsertInto(
|
|
|
|
write_group, current_sequence, column_family_memtables_.get(),
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
&flush_scheduler_, &trim_history_scheduler_,
|
|
|
|
write_options.ignore_missing_column_families,
|
|
|
|
0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
|
|
|
|
batch_per_txn_);
|
|
|
|
} else {
|
|
|
|
write_group.last_sequence = last_sequence;
|
|
|
|
write_thread_.LaunchParallelMemTableWriters(&write_group);
|
|
|
|
in_parallel_group = true;
|
|
|
|
|
|
|
|
// Each parallel follower is doing each own writes. The leader should
|
|
|
|
// also do its own.
|
|
|
|
if (w.ShouldWriteToMemtable()) {
|
|
|
|
ColumnFamilyMemTablesImpl column_family_memtables(
|
|
|
|
versions_->GetColumnFamilySet());
|
|
|
|
assert(w.sequence == current_sequence);
|
|
|
|
w.status = WriteBatchInternal::InsertInto(
|
|
|
|
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
&trim_history_scheduler_,
|
|
|
|
write_options.ignore_missing_column_families, 0 /*log_number*/,
|
|
|
|
this, true /*concurrent_memtable_writes*/, seq_per_batch_,
|
|
|
|
w.batch_cnt, batch_per_txn_,
|
|
|
|
write_options.memtable_insert_hint_per_batch);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (seq_used != nullptr) {
|
|
|
|
*seq_used = w.sequence;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
|
|
|
|
|
|
if (!io_s.ok()) {
|
|
|
|
// Check WriteToWAL status
|
|
|
|
IOStatusCheck(io_s);
|
|
|
|
}
|
|
|
|
if (!w.CallbackFailed()) {
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
if (!io_s.ok()) {
|
|
|
|
assert(pre_release_cb_status.ok());
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
} else {
|
|
|
|
WriteStatusCheck(pre_release_cb_status);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(pre_release_cb_status.ok());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (log_context.need_log_sync) {
|
|
|
|
VersionEdit synced_wals;
|
|
|
|
log_write_mutex_.Lock();
|
|
|
|
if (status.ok()) {
|
|
|
|
MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
|
|
|
|
&synced_wals);
|
|
|
|
} else {
|
|
|
|
MarkLogsNotSynced(logfile_number_);
|
|
|
|
}
|
|
|
|
log_write_mutex_.Unlock();
|
|
|
|
if (status.ok() && synced_wals.IsWalAddition()) {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
status = ApplyWALToManifest(&synced_wals);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Requesting sync with two_write_queues_ is expected to be very rare. We
|
|
|
|
// hence provide a simple implementation that is not necessarily efficient.
|
|
|
|
if (two_write_queues_) {
|
|
|
|
if (manual_wal_flush_) {
|
|
|
|
status = FlushWAL(true);
|
|
|
|
} else {
|
|
|
|
status = SyncWAL();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool should_exit_batch_group = true;
|
|
|
|
if (in_parallel_group) {
|
|
|
|
// CompleteParallelWorker returns true if this thread should
|
|
|
|
// handle exit, false means somebody else did
|
|
|
|
should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
|
|
|
|
}
|
|
|
|
if (should_exit_batch_group) {
|
|
|
|
if (status.ok()) {
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
3 years ago
|
|
|
for (auto* tmp_w : write_group) {
|
|
|
|
assert(tmp_w);
|
|
|
|
if (tmp_w->post_memtable_callback) {
|
|
|
|
Status tmp_s =
|
|
|
|
(*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
|
|
|
|
// TODO: propagate the execution status of post_memtable_callback to
|
|
|
|
// caller.
|
|
|
|
assert(tmp_s.ok());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Note: if we are to resume after non-OK statuses we need to revisit how
|
|
|
|
// we reacts to non-OK statuses here.
|
|
|
|
versions_->SetLastSequence(last_sequence);
|
|
|
|
}
|
|
|
|
MemTableInsertStatusCheck(w.status);
|
|
|
|
write_thread_.ExitAsBatchGroupLeader(write_group, status);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
status = w.FinalStatus();
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch, WriteCallback* callback,
|
|
|
|
uint64_t* log_used, uint64_t log_ref,
|
|
|
|
bool disable_memtable, uint64_t* seq_used) {
|
|
|
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
|
|
StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
|
|
|
|
|
|
|
|
WriteContext write_context;
|
|
|
|
|
|
|
|
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
disable_memtable, /*_batch_cnt=*/0,
|
|
|
|
/*_pre_release_callback=*/nullptr);
|
|
|
|
write_thread_.JoinBatchGroup(&w);
|
|
|
|
TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup");
|
|
|
|
if (w.state == WriteThread::STATE_GROUP_LEADER) {
|
|
|
|
WriteThread::WriteGroup wal_write_group;
|
|
|
|
if (w.callback && !w.callback->AllowWriteBatching()) {
|
|
|
|
write_thread_.WaitForMemTableWriters();
|
|
|
|
}
|
|
|
|
LogContext log_context(!write_options.disableWAL && write_options.sync);
|
|
|
|
// PreprocessWrite does its own perf timing.
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
w.status = PreprocessWrite(write_options, &log_context, &write_context);
|
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
|
|
|
|
|
|
// This can set non-OK status if callback fail.
|
|
|
|
last_batch_group_size_ =
|
|
|
|
write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
|
|
|
|
const SequenceNumber current_sequence =
|
|
|
|
write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
|
|
|
|
size_t total_count = 0;
|
|
|
|
size_t total_byte_size = 0;
|
|
|
|
|
|
|
|
if (w.status.ok()) {
|
|
|
|
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
|
|
// grabs but does not seem thread-safe.
|
|
|
|
if (tracer_) {
|
|
|
|
InstrumentedMutexLock lock(&trace_mutex_);
|
|
|
|
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
|
|
|
|
for (auto* writer : wal_write_group) {
|
|
|
|
// TODO: maybe handle the tracing status?
|
|
|
|
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
SequenceNumber next_sequence = current_sequence;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
for (auto* writer : wal_write_group) {
|
|
|
|
assert(writer);
|
|
|
|
if (writer->CheckCallback(this)) {
|
|
|
|
if (writer->ShouldWriteToMemtable()) {
|
|
|
|
writer->sequence = next_sequence;
|
|
|
|
size_t count = WriteBatchInternal::Count(writer->batch);
|
|
|
|
next_sequence += count;
|
|
|
|
total_count += count;
|
|
|
|
}
|
|
|
|
total_byte_size = WriteBatchInternal::AppendedByteSize(
|
|
|
|
total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (w.disable_wal) {
|
|
|
|
has_unpersisted_data_.store(true, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto stats = default_cf_internal_stats_;
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
|
|
|
|
RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
|
|
|
|
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
|
|
|
|
RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
|
|
|
|
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
IOStatus io_s;
|
|
|
|
io_s.PermitUncheckedError(); // Allow io_s to be uninitialized
|
|
|
|
|
|
|
|
if (w.status.ok() && !write_options.disableWAL) {
|
|
|
|
PERF_TIMER_GUARD(write_wal_time);
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
|
|
|
|
if (wal_write_group.size > 1) {
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
|
|
|
|
wal_write_group.size - 1);
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
|
|
|
|
}
|
|
|
|
assert(log_context.log_file_number_size);
|
|
|
|
LogFileNumberSize& log_file_number_size =
|
|
|
|
*(log_context.log_file_number_size);
|
|
|
|
io_s =
|
|
|
|
WriteToWAL(wal_write_group, log_context.writer, log_used,
|
|
|
|
log_context.need_log_sync, log_context.need_log_dir_sync,
|
|
|
|
current_sequence, log_file_number_size);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
w.status = io_s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!io_s.ok()) {
|
|
|
|
// Check WriteToWAL status
|
|
|
|
IOStatusCheck(io_s);
|
|
|
|
} else if (!w.CallbackFailed()) {
|
|
|
|
WriteStatusCheck(w.status);
|
|
|
|
}
|
|
|
|
|
|
|
|
VersionEdit synced_wals;
|
|
|
|
if (log_context.need_log_sync) {
|
|
|
|
InstrumentedMutexLock l(&log_write_mutex_);
|
|
|
|
if (w.status.ok()) {
|
|
|
|
MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
|
|
|
|
&synced_wals);
|
|
|
|
} else {
|
|
|
|
MarkLogsNotSynced(logfile_number_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (w.status.ok() && synced_wals.IsWalAddition()) {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
w.status = ApplyWALToManifest(&synced_wals);
|
|
|
|
}
|
|
|
|
write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
|
|
|
|
}
|
|
|
|
|
|
|
|
// NOTE: the memtable_write_group is declared before the following
|
|
|
|
// `if` statement because its lifetime needs to be longer
|
|
|
|
// that the inner context of the `if` as a reference to it
|
|
|
|
// may be used further below within the outer _write_thread
|
|
|
|
WriteThread::WriteGroup memtable_write_group;
|
|
|
|
|
|
|
|
if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
|
|
|
|
PERF_TIMER_GUARD(write_memtable_time);
|
|
|
|
assert(w.ShouldWriteToMemtable());
|
|
|
|
write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
|
|
|
|
if (memtable_write_group.size > 1 &&
|
|
|
|
immutable_db_options_.allow_concurrent_memtable_write) {
|
|
|
|
write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
|
|
|
|
} else {
|
|
|
|
memtable_write_group.status = WriteBatchInternal::InsertInto(
|
|
|
|
memtable_write_group, w.sequence, column_family_memtables_.get(),
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
&flush_scheduler_, &trim_history_scheduler_,
|
|
|
|
write_options.ignore_missing_column_families, 0 /*log_number*/, this,
|
|
|
|
false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
|
|
|
|
versions_->SetLastSequence(memtable_write_group.last_sequence);
|
|
|
|
write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// NOTE: the memtable_write_group is never really used,
|
|
|
|
// so we need to set its status to pass ASSERT_STATUS_CHECKED
|
|
|
|
memtable_write_group.status.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
|
|
|
|
assert(w.ShouldWriteToMemtable());
|
|
|
|
ColumnFamilyMemTablesImpl column_family_memtables(
|
|
|
|
versions_->GetColumnFamilySet());
|
|
|
|
w.status = WriteBatchInternal::InsertInto(
|
|
|
|
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
&trim_history_scheduler_, write_options.ignore_missing_column_families,
|
|
|
|
0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
|
|
|
|
false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
|
|
|
|
write_options.memtable_insert_hint_per_batch);
|
|
|
|
if (write_thread_.CompleteParallelMemTableWriter(&w)) {
|
|
|
|
MemTableInsertStatusCheck(w.status);
|
|
|
|
versions_->SetLastSequence(w.write_group->last_sequence);
|
|
|
|
write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (seq_used != nullptr) {
|
|
|
|
*seq_used = w.sequence;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(w.state == WriteThread::STATE_COMPLETED);
|
|
|
|
return w.FinalStatus();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch,
|
|
|
|
WriteCallback* callback, uint64_t log_ref,
|
|
|
|
SequenceNumber seq,
|
|
|
|
const size_t sub_batch_cnt) {
|
|
|
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
|
|
StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
|
|
|
|
|
|
|
|
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
|
|
|
false /*disable_memtable*/);
|
|
|
|
|
|
|
|
if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
|
|
|
|
w.sequence = seq;
|
|
|
|
size_t total_count = WriteBatchInternal::Count(my_batch);
|
|
|
|
InternalStats* stats = default_cf_internal_stats_;
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
|
|
|
|
RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
|
|
|
|
|
|
|
|
ColumnFamilyMemTablesImpl column_family_memtables(
|
|
|
|
versions_->GetColumnFamilySet());
|
|
|
|
w.status = WriteBatchInternal::InsertInto(
|
|
|
|
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
&trim_history_scheduler_, write_options.ignore_missing_column_families,
|
|
|
|
0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
|
|
|
|
seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
|
|
|
|
write_options.memtable_insert_hint_per_batch);
|
|
|
|
if (write_options.disableWAL) {
|
|
|
|
has_unpersisted_data_.store(true, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
|
|
|
|
if (pending_cnt == 0) {
|
|
|
|
// switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex
|
|
|
|
// before notify ensures that cv is in waiting state when it is notified
|
|
|
|
// thus not missing the update to pending_memtable_writes_ even though it is
|
|
|
|
// not modified under the mutex.
|
|
|
|
std::lock_guard<std::mutex> lck(switch_mutex_);
|
|
|
|
switch_cv_.notify_all();
|
|
|
|
}
|
|
|
|
WriteStatusCheck(w.status);
|
|
|
|
|
|
|
|
if (!w.FinalStatus().ok()) {
|
|
|
|
return w.FinalStatus();
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// The 2nd write queue. If enabled it will be used only for WAL-only writes.
|
|
|
|
// This is the only queue that updates LastPublishedSequence which is only
|
|
|
|
// applicable in a two-queue setting.
|
|
|
|
Status DBImpl::WriteImplWALOnly(
|
|
|
|
WriteThread* write_thread, const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used,
|
|
|
|
const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
|
|
|
|
PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
|
|
|
|
const PublishLastSeq publish_last_seq, const bool disable_memtable) {
|
|
|
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
|
|
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
|
|
|
disable_memtable, sub_batch_cnt, pre_release_callback);
|
|
|
|
StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
|
|
|
|
|
|
|
|
write_thread->JoinBatchGroup(&w);
|
|
|
|
assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
|
|
|
|
if (w.state == WriteThread::STATE_COMPLETED) {
|
|
|
|
if (log_used != nullptr) {
|
|
|
|
*log_used = w.log_used;
|
|
|
|
}
|
|
|
|
if (seq_used != nullptr) {
|
|
|
|
*seq_used = w.sequence;
|
|
|
|
}
|
|
|
|
return w.FinalStatus();
|
|
|
|
}
|
|
|
|
// else we are the leader of the write batch group
|
|
|
|
assert(w.state == WriteThread::STATE_GROUP_LEADER);
|
|
|
|
|
|
|
|
if (publish_last_seq == kDoPublishLastSeq) {
|
|
|
|
Status status;
|
|
|
|
|
|
|
|
// Currently we only use kDoPublishLastSeq in unordered_write
|
|
|
|
assert(immutable_db_options_.unordered_write);
|
|
|
|
WriteContext write_context;
|
|
|
|
if (error_handler_.IsDBStopped()) {
|
|
|
|
status = error_handler_.GetBGError();
|
|
|
|
}
|
|
|
|
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them
|
|
|
|
// without paying the cost of obtaining the mutex.
|
|
|
|
if (status.ok()) {
|
|
|
|
LogContext log_context;
|
|
|
|
status = PreprocessWrite(write_options, &log_context, &write_context);
|
|
|
|
WriteStatusCheckOnLocked(status);
|
|
|
|
}
|
|
|
|
if (!status.ok()) {
|
|
|
|
WriteThread::WriteGroup write_group;
|
|
|
|
write_thread->EnterAsBatchGroupLeader(&w, &write_group);
|
|
|
|
write_thread->ExitAsBatchGroupLeader(write_group, status);
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteThread::WriteGroup write_group;
|
|
|
|
uint64_t last_sequence;
|
|
|
|
write_thread->EnterAsBatchGroupLeader(&w, &write_group);
|
|
|
|
// Note: no need to update last_batch_group_size_ here since the batch writes
|
|
|
|
// to WAL only
|
|
|
|
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
|
|
|
|
// grabs but does not seem thread-safe.
|
|
|
|
if (tracer_) {
|
|
|
|
InstrumentedMutexLock lock(&trace_mutex_);
|
|
|
|
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
|
|
|
|
for (auto* writer : write_group) {
|
|
|
|
// TODO: maybe handle the tracing status?
|
|
|
|
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t pre_release_callback_cnt = 0;
|
|
|
|
size_t total_byte_size = 0;
|
|
|
|
for (auto* writer : write_group) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
assert(writer);
|
|
|
|
if (writer->CheckCallback(this)) {
|
|
|
|
total_byte_size = WriteBatchInternal::AppendedByteSize(
|
|
|
|
total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
|
|
|
|
if (writer->pre_release_callback) {
|
|
|
|
pre_release_callback_cnt++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const bool concurrent_update = true;
|
|
|
|
// Update stats while we are an exclusive group leader, so we know
|
|
|
|
// that nobody else can be writing to these particular stats.
|
|
|
|
// We're optimistic, updating the stats before we successfully
|
|
|
|
// commit. That lets us release our leader status early.
|
|
|
|
auto stats = default_cf_internal_stats_;
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
|
|
|
|
concurrent_update);
|
|
|
|
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
|
|
|
|
concurrent_update);
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_SELF);
|
|
|
|
auto write_done_by_other = write_group.size - 1;
|
|
|
|
if (write_done_by_other > 0) {
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
|
|
|
|
write_done_by_other, concurrent_update);
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
|
|
|
|
}
|
|
|
|
RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
|
|
|
|
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
|
|
|
|
PERF_TIMER_GUARD(write_wal_time);
|
|
|
|
// LastAllocatedSequence is increased inside WriteToWAL under
|
|
|
|
// wal_write_mutex_ to ensure ordered events in WAL
|
|
|
|
size_t seq_inc = 0 /* total_count */;
|
|
|
|
if (assign_order == kDoAssignOrder) {
|
|
|
|
size_t total_batch_cnt = 0;
|
|
|
|
for (auto* writer : write_group) {
|
|
|
|
assert(writer->batch_cnt || !seq_per_batch_);
|
|
|
|
if (!writer->CallbackFailed()) {
|
|
|
|
total_batch_cnt += writer->batch_cnt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
seq_inc = total_batch_cnt;
|
|
|
|
}
|
|
|
|
Status status;
|
|
|
|
if (!write_options.disableWAL) {
|
|
|
|
IOStatus io_s =
|
|
|
|
ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
status = io_s;
|
|
|
|
// last_sequence may not be set if there is an error
|
|
|
|
// This error checking and return is moved up to avoid using uninitialized
|
|
|
|
// last_sequence.
|
|
|
|
if (!io_s.ok()) {
|
|
|
|
IOStatusCheck(io_s);
|
|
|
|
write_thread->ExitAsBatchGroupLeader(write_group, status);
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Otherwise we inc seq number to do solely the seq allocation
|
|
|
|
last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t memtable_write_cnt = 0;
|
|
|
|
auto curr_seq = last_sequence + 1;
|
|
|
|
for (auto* writer : write_group) {
|
|
|
|
if (writer->CallbackFailed()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
writer->sequence = curr_seq;
|
|
|
|
if (assign_order == kDoAssignOrder) {
|
|
|
|
assert(writer->batch_cnt || !seq_per_batch_);
|
|
|
|
curr_seq += writer->batch_cnt;
|
|
|
|
}
|
|
|
|
if (!writer->disable_memtable) {
|
|
|
|
memtable_write_cnt++;
|
|
|
|
}
|
|
|
|
// else seq advances only by memtable writes
|
|
|
|
}
|
|
|
|
if (status.ok() && write_options.sync) {
|
|
|
|
assert(!write_options.disableWAL);
|
|
|
|
// Requesting sync with two_write_queues_ is expected to be very rare. We
|
|
|
|
// hance provide a simple implementation that is not necessarily efficient.
|
|
|
|
if (manual_wal_flush_) {
|
|
|
|
status = FlushWAL(true);
|
|
|
|
} else {
|
|
|
|
status = SyncWAL();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
|
|
|
|
|
|
if (!w.CallbackFailed()) {
|
|
|
|
WriteStatusCheck(status);
|
|
|
|
}
|
|
|
|
if (status.ok()) {
|
|
|
|
size_t index = 0;
|
|
|
|
for (auto* writer : write_group) {
|
|
|
|
if (!writer->CallbackFailed() && writer->pre_release_callback) {
|
|
|
|
assert(writer->sequence != kMaxSequenceNumber);
|
|
|
|
Status ws = writer->pre_release_callback->Callback(
|
|
|
|
writer->sequence, disable_memtable, writer->log_used, index++,
|
|
|
|
pre_release_callback_cnt);
|
|
|
|
if (!ws.ok()) {
|
|
|
|
status = ws;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (publish_last_seq == kDoPublishLastSeq) {
|
|
|
|
versions_->SetLastSequence(last_sequence + seq_inc);
|
|
|
|
// Currently we only use kDoPublishLastSeq in unordered_write
|
|
|
|
assert(immutable_db_options_.unordered_write);
|
|
|
|
}
|
|
|
|
if (immutable_db_options_.unordered_write && status.ok()) {
|
|
|
|
pending_memtable_writes_ += memtable_write_cnt;
|
|
|
|
}
|
|
|
|
write_thread->ExitAsBatchGroupLeader(write_group, status);
|
|
|
|
if (status.ok()) {
|
|
|
|
status = w.FinalStatus();
|
|
|
|
}
|
|
|
|
if (seq_used != nullptr) {
|
|
|
|
*seq_used = w.sequence;
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
|
|
|
|
// Is setting bg_error_ enough here? This will at least stop
|
|
|
|
// compaction and fail any further writes.
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
|
|
|
|
if (immutable_db_options_.paranoid_checks && !status.ok() &&
|
|
|
|
!status.IsBusy() && !status.IsIncomplete()) {
|
|
|
|
// Maybe change the return status to void?
|
|
|
|
error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::WriteStatusCheck(const Status& status) {
|
|
|
|
// Is setting bg_error_ enough here? This will at least stop
|
|
|
|
// compaction and fail any further writes.
|
|
|
|
assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
|
|
|
|
if (immutable_db_options_.paranoid_checks && !status.ok() &&
|
|
|
|
!status.IsBusy() && !status.IsIncomplete()) {
|
|
|
|
mutex_.Lock();
|
|
|
|
// Maybe change the return status to void?
|
|
|
|
error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
|
|
|
|
mutex_.Unlock();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
void DBImpl::IOStatusCheck(const IOStatus& io_status) {
|
|
|
|
// Is setting bg_error_ enough here? This will at least stop
|
|
|
|
// compaction and fail any further writes.
|
|
|
|
if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
|
|
|
|
!io_status.IsBusy() && !io_status.IsIncomplete()) ||
|
|
|
|
io_status.IsIOFenced()) {
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
mutex_.Lock();
|
|
|
|
// Maybe change the return status to void?
|
|
|
|
error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
mutex_.Unlock();
|
|
|
|
} else {
|
|
|
|
// Force writable file to be continue writable.
|
|
|
|
logs_.back().writer->file()->reset_seen_error();
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::MemTableInsertStatusCheck(const Status& status) {
|
|
|
|
// A non-OK status here indicates that the state implied by the
|
|
|
|
// WAL has diverged from the in-memory state. This could be
|
|
|
|
// because of a corrupt write_batch (very bad), or because the
|
|
|
|
// client specified an invalid column family and didn't specify
|
|
|
|
// ignore_missing_column_families.
|
|
|
|
if (!status.ok()) {
|
|
|
|
mutex_.Lock();
|
|
|
|
assert(!error_handler_.IsBGWorkStopped());
|
|
|
|
// Maybe change the return status to void?
|
|
|
|
error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable)
|
|
|
|
.PermitUncheckedError();
|
|
|
|
mutex_.Unlock();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
|
LogContext* log_context,
|
|
|
|
WriteContext* write_context) {
|
|
|
|
assert(write_context != nullptr && log_context != nullptr);
|
|
|
|
Status status;
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
6 years ago
|
|
|
if (error_handler_.IsDBStopped()) {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
6 years ago
|
|
|
status = error_handler_.GetBGError();
|
|
|
|
}
|
|
|
|
|
|
|
|
PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
|
|
|
|
assert(versions_);
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
const ColumnFamilySet* const column_families =
|
|
|
|
versions_->GetColumnFamilySet();
|
|
|
|
assert(column_families);
|
|
|
|
size_t num_cfs = column_families->NumberOfColumnFamilies();
|
|
|
|
assert(num_cfs >= 1);
|
|
|
|
if (num_cfs > 1) {
|
|
|
|
WaitForPendingWrites();
|
|
|
|
status = SwitchWAL(write_context);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
|
|
|
|
// Before a new memtable is added in SwitchMemtable(),
|
|
|
|
// write_buffer_manager_->ShouldFlush() will keep returning true. If another
|
|
|
|
// thread is writing to another DB with the same write buffer, they may also
|
|
|
|
// be flushed. We may end up with flushing much more DBs than needed. It's
|
|
|
|
// suboptimal but still correct.
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
WaitForPendingWrites();
|
|
|
|
status = HandleWriteBufferManagerFlush(write_context);
|
|
|
|
}
|
|
|
|
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
status = TrimMemtableHistory(write_context);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
WaitForPendingWrites();
|
|
|
|
status = ScheduleFlushes(write_context);
|
|
|
|
}
|
|
|
|
|
|
|
|
PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
|
|
|
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
|
|
|
|
write_controller_.NeedsDelay()))) {
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
PERF_TIMER_GUARD(write_delay_time);
|
|
|
|
// We don't know size of curent batch so that we always use the size
|
|
|
|
// for previous one. It might create a fairness issue that expiration
|
|
|
|
// might happen for smaller writes but larger writes can go through.
|
|
|
|
// Can optimize it if it is an issue.
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
status = DelayWrite(last_batch_group_size_, write_options);
|
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If memory usage exceeded beyond a certain threshold,
|
|
|
|
// write_buffer_manager_->ShouldStall() returns true to all threads writing to
|
|
|
|
// all DBs and writers will be stalled.
|
|
|
|
// It does soft checking because WriteBufferManager::buffer_limit_ has already
|
|
|
|
// exceeded at this point so no new write (including current one) will go
|
|
|
|
// through until memory usage is decreased.
|
|
|
|
if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
|
|
|
|
if (write_options.no_slowdown) {
|
|
|
|
status = Status::Incomplete("Write stall");
|
|
|
|
} else {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
WriteBufferManagerStallWrites();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
InstrumentedMutexLock l(&log_write_mutex_);
|
|
|
|
if (status.ok() && log_context->need_log_sync) {
|
|
|
|
// Wait until the parallel syncs are finished. Any sync process has to sync
|
|
|
|
// the front log too so it is enough to check the status of front()
|
|
|
|
// We do a while loop since log_sync_cv_ is signalled when any sync is
|
|
|
|
// finished
|
|
|
|
// Note: there does not seem to be a reason to wait for parallel sync at
|
|
|
|
// this early step but it is not important since parallel sync (SyncWAL) and
|
|
|
|
// need_log_sync are usually not used together.
|
|
|
|
while (logs_.front().IsSyncing()) {
|
|
|
|
log_sync_cv_.Wait();
|
|
|
|
}
|
|
|
|
for (auto& log : logs_) {
|
|
|
|
// This is just to prevent the logs to be synced by a parallel SyncWAL
|
|
|
|
// call. We will do the actual syncing later after we will write to the
|
|
|
|
// WAL.
|
|
|
|
// Note: there does not seem to be a reason to set this early before we
|
|
|
|
// actually write to the WAL
|
|
|
|
log.PrepareForSync();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
log_context->need_log_sync = false;
|
|
|
|
}
|
|
|
|
log_context->writer = logs_.back().writer;
|
|
|
|
log_context->need_log_dir_sync =
|
|
|
|
log_context->need_log_dir_sync && !log_dir_synced_;
|
|
|
|
log_context->log_file_number_size = std::addressof(alive_log_files_.back());
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
|
|
|
|
WriteBatch* tmp_batch, WriteBatch** merged_batch,
|
|
|
|
size_t* write_with_wal,
|
|
|
|
WriteBatch** to_be_cached_state) {
|
|
|
|
assert(write_with_wal != nullptr);
|
|
|
|
assert(tmp_batch != nullptr);
|
|
|
|
assert(*to_be_cached_state == nullptr);
|
|
|
|
*write_with_wal = 0;
|
|
|
|
auto* leader = write_group.leader;
|
|
|
|
assert(!leader->disable_wal); // Same holds for all in the batch group
|
|
|
|
if (write_group.size == 1 && !leader->CallbackFailed() &&
|
|
|
|
leader->batch->GetWalTerminationPoint().is_cleared()) {
|
|
|
|
// we simply write the first WriteBatch to WAL if the group only
|
|
|
|
// contains one batch, that batch should be written to the WAL,
|
|
|
|
// and the batch is not wanting to be truncated
|
|
|
|
*merged_batch = leader->batch;
|
|
|
|
if (WriteBatchInternal::IsLatestPersistentState(*merged_batch)) {
|
|
|
|
*to_be_cached_state = *merged_batch;
|
|
|
|
}
|
|
|
|
*write_with_wal = 1;
|
|
|
|
} else {
|
|
|
|
// WAL needs all of the batches flattened into a single batch.
|
|
|
|
// We could avoid copying here with an iov-like AddRecord
|
|
|
|
// interface
|
|
|
|
*merged_batch = tmp_batch;
|
|
|
|
for (auto writer : write_group) {
|
|
|
|
if (!writer->CallbackFailed()) {
|
|
|
|
Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
|
|
|
|
/*WAL_only*/ true);
|
|
|
|
if (!s.ok()) {
|
|
|
|
tmp_batch->Clear();
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
|
|
|
|
// We only need to cache the last of such write batch
|
|
|
|
*to_be_cached_state = writer->batch;
|
|
|
|
}
|
|
|
|
(*write_with_wal)++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// return merged_batch;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// When two_write_queues_ is disabled, this function is called from the only
|
|
|
|
// write thread. Otherwise this must be called holding log_write_mutex_.
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
|
|
|
|
log::Writer* log_writer, uint64_t* log_used,
|
Rate-limit automatic WAL flush after each user write (#9607)
Summary:
**Context:**
WAL flush is currently not rate-limited by `Options::rate_limiter`. This PR is to provide rate-limiting to auto WAL flush, the one that automatically happen after each user write operation (i.e, `Options::manual_wal_flush == false`), by adding `WriteOptions::rate_limiter_options`.
Note that we are NOT rate-limiting WAL flush that do NOT automatically happen after each user write, such as `Options::manual_wal_flush == true + manual FlushWAL()` (rate-limiting multiple WAL flushes), for the benefits of:
- being consistent with [ReadOptions::rate_limiter_priority](https://github.com/facebook/rocksdb/blob/7.0.fb/include/rocksdb/options.h#L515)
- being able to turn off some WAL flush's rate-limiting but not all (e.g, turn off specific the WAL flush of a critical user write like a service's heartbeat)
`WriteOptions::rate_limiter_options` only accept `Env::IO_USER` and `Env::IO_TOTAL` currently due to an implementation constraint.
- The constraint is that we currently queue parallel writes (including WAL writes) based on FIFO policy which does not factor rate limiter priority into this layer's scheduling. If we allow lower priorities such as `Env::IO_HIGH/MID/LOW` and such writes specified with lower priorities occurs before ones specified with higher priorities (even just by a tiny bit in arrival time), the former would have blocked the latter, leading to a "priority inversion" issue and contradictory to what we promise for rate-limiting priority. Therefore we only allow `Env::IO_USER` and `Env::IO_TOTAL` right now before improving that scheduling.
A pre-requisite to this feature is to support operation-level rate limiting in `WritableFileWriter`, which is also included in this PR.
**Summary:**
- Renamed test suite `DBRateLimiterTest to DBRateLimiterOnReadTest` for adding a new test suite
- Accept `rate_limiter_priority` in `WritableFileWriter`'s private and public write functions
- Passed `WriteOptions::rate_limiter_options` to `WritableFileWriter` in the path of automatic WAL flush.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9607
Test Plan:
- Added new unit test to verify existing flush/compaction rate-limiting does not break, since `DBTest, RateLimitingTest` is disabled and current db-level rate-limiting tests focus on read only (e.g, `db_rate_limiter_test`, `DBTest2, RateLimitedCompactionReads`).
- Added new unit test `DBRateLimiterOnWriteWALTest, AutoWalFlush`
- `strace -ftt -e trace=write ./db_bench -benchmarks=fillseq -db=/dev/shm/testdb -rate_limit_auto_wal_flush=1 -rate_limiter_bytes_per_sec=15 -rate_limiter_refill_period_us=1000000 -write_buffer_size=100000000 -disable_auto_compactions=1 -num=100`
- verified that WAL flush(i.e, system-call _write_) were chunked into 15 bytes and each _write_ was roughly 1 second apart
- verified the chunking disappeared when `-rate_limit_auto_wal_flush=0`
- crash test: `python3 tools/db_crashtest.py blackbox --disable_wal=0 --rate_limit_auto_wal_flush=1 --rate_limiter_bytes_per_sec=10485760 --interval=10` killed as normal
**Benchmarked on flush/compaction to ensure no performance regression:**
- compaction with rate-limiting (see table 1, avg over 1280-run): pre-change: **915635 micros/op**; post-change:
**907350 micros/op (improved by 0.106%)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f compact_bmk_output.txt compact_bmk_output_2.txt dont_care_output.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench --benchmarks=fillrandom -db=$TEST_TMPDIR -disable_auto_compactions=1 -write_buffer_size=6710886 > dont_care_output.txt && ./db_bench --benchmarks=compact -use_existing_db=1 -db=$TEST_TMPDIR -level0_file_num_compaction_trigger=1 -rate_limiter_bytes_per_sec=100000000 | egrep 'compact'
done > compact_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' compact_bmk_output.txt >> compact_bmk_output_2.txt
done
```
- compaction w/o rate-limiting (see table 2, avg over 640-run): pre-change: **822197 micros/op**; post-change: **823148 micros/op (regressed by 0.12%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
- flush with rate-limiting (see table 3, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench ): pre-change: **745752 micros/op**; post-change: **745331 micros/op (regressed by 0.06 %)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f flush_bmk_output.txt flush_bmk_output_2.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench -db=$TEST_TMPDIR -write_buffer_size=1048576000 -num=1000000 -rate_limiter_bytes_per_sec=100000000 -benchmarks=fillseq,flush | egrep 'flush'
done > flush_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' flush_bmk_output.txt >> flush_bmk_output_2.txt
done
```
- flush w/o rate-limiting (see table 4, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench): pre-change: **487512 micros/op**, post-change: **485856 micors/ops (improved by 0.34%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
| table 1 - compact with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 896978 | 16046.9 | 901242 | 15670.9 | 0.475373978
20 | 893718 | 15813 | 886505 | 17544.7 | -0.8070778478
40 | 900426 | 23882.2 | 894958 | 15104.5 | -0.6072681153
80 | 906635 | 21761.5 | 903332 | 23948.3 | -0.3643141948
160 | 898632 | 21098.9 | 907583 | 21145 | 0.9960695813
3.20E+02 | 905252 | 22785.5 | 908106 | 25325.5 | 0.3152713278
6.40E+02 | 905213 | 23598.6 | 906741 | 21370.5 | 0.1688000504
**1.28E+03** | **908316** | **23533.1** | **907350** | **24626.8** | **-0.1063506533**
average over #-run | 901896.25 | 21064.9625 | 901977.125 | 20592.025 | 0.008967217682
| table 2 - compact w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 811211 | 26996.7 | 807586 | 28456.4 | -0.4468627768
20 | 815465 | 14803.7 | 814608 | 28719.7 | -0.105093413
40 | 809203 | 26187.1 | 797835 | 25492.1 | -1.404839082
80 | 822088 | 28765.3 | 822192 | 32840.4 | 0.01265071379
160 | 821719 | 36344.7 | 821664 | 29544.9 | -0.006693285661
3.20E+02 | 820921 | 27756.4 | 821403 | 28347.7 | 0.05871454135
**6.40E+02** | **822197** | **28960.6** | **823148** | **30055.1** | **0.1156657103**
average over #-run | 8.18E+05 | 2.71E+04 | 8.15E+05 | 2.91E+04 | -0.25
| table 3 - flush with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 741721 | 11770.8 | 740345 | 5949.76 | -0.1855144994
20 | 735169 | 3561.83 | 743199 | 9755.77 | 1.09226586
40 | 743368 | 8891.03 | 742102 | 8683.22 | -0.1703059588
80 | 742129 | 8148.51 | 743417 | 9631.58| 0.1735547324
160 | 749045 | 9757.21 | 746256 | 9191.86 | -0.3723407806
**3.20E+02** | **745752** | **9819.65** | **745331** | **9840.62** | **-0.0564530836**
6.40E+02 | 749006 | 11080.5 | 748173 | 10578.7 | -0.1112140624
average over #-run | 743741.4286 | 9004.218571 | 744117.5714 | 9090.215714 | 0.05057441238
| table 4 - flush w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 477283 | 24719.6 | 473864 | 12379 | -0.7163464863
20 | 486743 | 20175.2 | 502296 | 23931.3 | 3.195320734
40 | 482846 | 15309.2 | 489820 | 22259.5 | 1.444352858
80 | 491490 | 21883.1 | 490071 | 23085.7 | -0.2887139108
160 | 493347 | 28074.3 | 483609 | 21211.7 | -1.973864238
**3.20E+02** | **487512** | **21401.5** | **485856** | **22195.2** | **-0.3396839462**
6.40E+02 | 490307 | 25418.6 | 485435 | 22405.2 | -0.9936631539
average over #-run | 4.87E+05 | 2.24E+04 | 4.87E+05 | 2.11E+04 | 0.00E+00
Reviewed By: ajkr
Differential Revision: D34442441
Pulled By: hx235
fbshipit-source-id: 4790f13e1e5c0a95ae1d1cc93ffcf69dc6e78bdd
3 years ago
|
|
|
uint64_t* log_size,
|
Fix a TSAN-reported bug caused by concurrent accesss to std::deque (#9686)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9686
According to https://www.cplusplus.com/reference/deque/deque/back/,
"
The container is accessed (neither the const nor the non-const versions modify the container).
The last element is potentially accessed or modified by the caller. Concurrently accessing or modifying other elements is safe.
"
Also according to https://www.cplusplus.com/reference/deque/deque/pop_front/,
"
The container is modified.
The first element is modified. Concurrently accessing or modifying other elements is safe (although see iterator validity above).
"
In RocksDB, we never pop the last element of `DBImpl::alive_log_files_`. We have been
exploiting this fact and the above two properties when ensuring correctness when
`DBImpl::alive_log_files_` may be accessed concurrently. Specifically, it can be accessed
in the write path when db mutex is released. Sometimes, the log_mute_ is held. It can also be accessed in `FindObsoleteFiles()`
when db mutex is always held. It can also be accessed
during recovery when db mutex is also held.
Given the fact that we never pop the last element of alive_log_files_, we currently do not
acquire additional locks when accessing it in `WriteToWAL()` as follows
```
alive_log_files_.back().AddSize(log_entry.size());
```
This is problematic.
Check source code of deque.h
```
back() _GLIBCXX_NOEXCEPT
{
__glibcxx_requires_nonempty();
...
}
pop_front() _GLIBCXX_NOEXCEPT
{
...
if (this->_M_impl._M_start._M_cur
!= this->_M_impl._M_start._M_last - 1)
{
...
++this->_M_impl._M_start._M_cur;
}
...
}
```
`back()` will actually call `__glibcxx_requires_nonempty()` first.
If `__glibcxx_requires_nonempty()` is enabled and not an empty macro,
it will call `empty()`
```
bool empty() {
return this->_M_impl._M_finish == this->_M_impl._M_start;
}
```
You can see that it will access `this->_M_impl._M_start`, racing with `pop_front()`.
Therefore, TSAN will actually catch the bug in this case.
To be able to use TSAN on our library and unit tests, we should always coordinate
concurrent accesses to STL containers properly.
We need to pass information about db mutex and log mutex into `WriteToWAL()`, otherwise
it's impossible to know which mutex to acquire inside the function.
To fix this, we can catch the tail of `alive_log_files_` by reference, so that we do not have to call `back()` in `WriteToWAL()`.
Reviewed By: pdillinger
Differential Revision: D34780309
fbshipit-source-id: 1def9821f0c437f2736c6a26445d75890377889b
3 years ago
|
|
|
Env::IOPriority rate_limiter_priority,
|
|
|
|
LogFileNumberSize& log_file_number_size) {
|
|
|
|
assert(log_size != nullptr);
|
Fix a TSAN-reported bug caused by concurrent accesss to std::deque (#9686)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9686
According to https://www.cplusplus.com/reference/deque/deque/back/,
"
The container is accessed (neither the const nor the non-const versions modify the container).
The last element is potentially accessed or modified by the caller. Concurrently accessing or modifying other elements is safe.
"
Also according to https://www.cplusplus.com/reference/deque/deque/pop_front/,
"
The container is modified.
The first element is modified. Concurrently accessing or modifying other elements is safe (although see iterator validity above).
"
In RocksDB, we never pop the last element of `DBImpl::alive_log_files_`. We have been
exploiting this fact and the above two properties when ensuring correctness when
`DBImpl::alive_log_files_` may be accessed concurrently. Specifically, it can be accessed
in the write path when db mutex is released. Sometimes, the log_mute_ is held. It can also be accessed in `FindObsoleteFiles()`
when db mutex is always held. It can also be accessed
during recovery when db mutex is also held.
Given the fact that we never pop the last element of alive_log_files_, we currently do not
acquire additional locks when accessing it in `WriteToWAL()` as follows
```
alive_log_files_.back().AddSize(log_entry.size());
```
This is problematic.
Check source code of deque.h
```
back() _GLIBCXX_NOEXCEPT
{
__glibcxx_requires_nonempty();
...
}
pop_front() _GLIBCXX_NOEXCEPT
{
...
if (this->_M_impl._M_start._M_cur
!= this->_M_impl._M_start._M_last - 1)
{
...
++this->_M_impl._M_start._M_cur;
}
...
}
```
`back()` will actually call `__glibcxx_requires_nonempty()` first.
If `__glibcxx_requires_nonempty()` is enabled and not an empty macro,
it will call `empty()`
```
bool empty() {
return this->_M_impl._M_finish == this->_M_impl._M_start;
}
```
You can see that it will access `this->_M_impl._M_start`, racing with `pop_front()`.
Therefore, TSAN will actually catch the bug in this case.
To be able to use TSAN on our library and unit tests, we should always coordinate
concurrent accesses to STL containers properly.
We need to pass information about db mutex and log mutex into `WriteToWAL()`, otherwise
it's impossible to know which mutex to acquire inside the function.
To fix this, we can catch the tail of `alive_log_files_` by reference, so that we do not have to call `back()` in `WriteToWAL()`.
Reviewed By: pdillinger
Differential Revision: D34780309
fbshipit-source-id: 1def9821f0c437f2736c6a26445d75890377889b
3 years ago
|
|
|
|
|
|
|
Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("DBImpl::WriteToWAL:log_entry", &log_entry);
|
|
|
|
auto s = merged_batch.VerifyChecksum();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return status_to_io_status(std::move(s));
|
|
|
|
}
|
|
|
|
*log_size = log_entry.size();
|
|
|
|
// When two_write_queues_ WriteToWAL has to be protected from concurretn calls
|
|
|
|
// from the two queues anyway and log_write_mutex_ is already held. Otherwise
|
|
|
|
// if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
|
|
|
|
// from possible concurrent calls via the FlushWAL by the application.
|
|
|
|
const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
|
|
|
|
// Due to performance cocerns of missed branch prediction penalize the new
|
|
|
|
// manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
|
|
|
|
// when we do not need any locking.
|
|
|
|
if (UNLIKELY(needs_locking)) {
|
|
|
|
log_write_mutex_.Lock();
|
|
|
|
}
|
Rate-limit automatic WAL flush after each user write (#9607)
Summary:
**Context:**
WAL flush is currently not rate-limited by `Options::rate_limiter`. This PR is to provide rate-limiting to auto WAL flush, the one that automatically happen after each user write operation (i.e, `Options::manual_wal_flush == false`), by adding `WriteOptions::rate_limiter_options`.
Note that we are NOT rate-limiting WAL flush that do NOT automatically happen after each user write, such as `Options::manual_wal_flush == true + manual FlushWAL()` (rate-limiting multiple WAL flushes), for the benefits of:
- being consistent with [ReadOptions::rate_limiter_priority](https://github.com/facebook/rocksdb/blob/7.0.fb/include/rocksdb/options.h#L515)
- being able to turn off some WAL flush's rate-limiting but not all (e.g, turn off specific the WAL flush of a critical user write like a service's heartbeat)
`WriteOptions::rate_limiter_options` only accept `Env::IO_USER` and `Env::IO_TOTAL` currently due to an implementation constraint.
- The constraint is that we currently queue parallel writes (including WAL writes) based on FIFO policy which does not factor rate limiter priority into this layer's scheduling. If we allow lower priorities such as `Env::IO_HIGH/MID/LOW` and such writes specified with lower priorities occurs before ones specified with higher priorities (even just by a tiny bit in arrival time), the former would have blocked the latter, leading to a "priority inversion" issue and contradictory to what we promise for rate-limiting priority. Therefore we only allow `Env::IO_USER` and `Env::IO_TOTAL` right now before improving that scheduling.
A pre-requisite to this feature is to support operation-level rate limiting in `WritableFileWriter`, which is also included in this PR.
**Summary:**
- Renamed test suite `DBRateLimiterTest to DBRateLimiterOnReadTest` for adding a new test suite
- Accept `rate_limiter_priority` in `WritableFileWriter`'s private and public write functions
- Passed `WriteOptions::rate_limiter_options` to `WritableFileWriter` in the path of automatic WAL flush.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9607
Test Plan:
- Added new unit test to verify existing flush/compaction rate-limiting does not break, since `DBTest, RateLimitingTest` is disabled and current db-level rate-limiting tests focus on read only (e.g, `db_rate_limiter_test`, `DBTest2, RateLimitedCompactionReads`).
- Added new unit test `DBRateLimiterOnWriteWALTest, AutoWalFlush`
- `strace -ftt -e trace=write ./db_bench -benchmarks=fillseq -db=/dev/shm/testdb -rate_limit_auto_wal_flush=1 -rate_limiter_bytes_per_sec=15 -rate_limiter_refill_period_us=1000000 -write_buffer_size=100000000 -disable_auto_compactions=1 -num=100`
- verified that WAL flush(i.e, system-call _write_) were chunked into 15 bytes and each _write_ was roughly 1 second apart
- verified the chunking disappeared when `-rate_limit_auto_wal_flush=0`
- crash test: `python3 tools/db_crashtest.py blackbox --disable_wal=0 --rate_limit_auto_wal_flush=1 --rate_limiter_bytes_per_sec=10485760 --interval=10` killed as normal
**Benchmarked on flush/compaction to ensure no performance regression:**
- compaction with rate-limiting (see table 1, avg over 1280-run): pre-change: **915635 micros/op**; post-change:
**907350 micros/op (improved by 0.106%)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f compact_bmk_output.txt compact_bmk_output_2.txt dont_care_output.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench --benchmarks=fillrandom -db=$TEST_TMPDIR -disable_auto_compactions=1 -write_buffer_size=6710886 > dont_care_output.txt && ./db_bench --benchmarks=compact -use_existing_db=1 -db=$TEST_TMPDIR -level0_file_num_compaction_trigger=1 -rate_limiter_bytes_per_sec=100000000 | egrep 'compact'
done > compact_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' compact_bmk_output.txt >> compact_bmk_output_2.txt
done
```
- compaction w/o rate-limiting (see table 2, avg over 640-run): pre-change: **822197 micros/op**; post-change: **823148 micros/op (regressed by 0.12%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
- flush with rate-limiting (see table 3, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench ): pre-change: **745752 micros/op**; post-change: **745331 micros/op (regressed by 0.06 %)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f flush_bmk_output.txt flush_bmk_output_2.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench -db=$TEST_TMPDIR -write_buffer_size=1048576000 -num=1000000 -rate_limiter_bytes_per_sec=100000000 -benchmarks=fillseq,flush | egrep 'flush'
done > flush_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' flush_bmk_output.txt >> flush_bmk_output_2.txt
done
```
- flush w/o rate-limiting (see table 4, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench): pre-change: **487512 micros/op**, post-change: **485856 micors/ops (improved by 0.34%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
| table 1 - compact with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 896978 | 16046.9 | 901242 | 15670.9 | 0.475373978
20 | 893718 | 15813 | 886505 | 17544.7 | -0.8070778478
40 | 900426 | 23882.2 | 894958 | 15104.5 | -0.6072681153
80 | 906635 | 21761.5 | 903332 | 23948.3 | -0.3643141948
160 | 898632 | 21098.9 | 907583 | 21145 | 0.9960695813
3.20E+02 | 905252 | 22785.5 | 908106 | 25325.5 | 0.3152713278
6.40E+02 | 905213 | 23598.6 | 906741 | 21370.5 | 0.1688000504
**1.28E+03** | **908316** | **23533.1** | **907350** | **24626.8** | **-0.1063506533**
average over #-run | 901896.25 | 21064.9625 | 901977.125 | 20592.025 | 0.008967217682
| table 2 - compact w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 811211 | 26996.7 | 807586 | 28456.4 | -0.4468627768
20 | 815465 | 14803.7 | 814608 | 28719.7 | -0.105093413
40 | 809203 | 26187.1 | 797835 | 25492.1 | -1.404839082
80 | 822088 | 28765.3 | 822192 | 32840.4 | 0.01265071379
160 | 821719 | 36344.7 | 821664 | 29544.9 | -0.006693285661
3.20E+02 | 820921 | 27756.4 | 821403 | 28347.7 | 0.05871454135
**6.40E+02** | **822197** | **28960.6** | **823148** | **30055.1** | **0.1156657103**
average over #-run | 8.18E+05 | 2.71E+04 | 8.15E+05 | 2.91E+04 | -0.25
| table 3 - flush with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 741721 | 11770.8 | 740345 | 5949.76 | -0.1855144994
20 | 735169 | 3561.83 | 743199 | 9755.77 | 1.09226586
40 | 743368 | 8891.03 | 742102 | 8683.22 | -0.1703059588
80 | 742129 | 8148.51 | 743417 | 9631.58| 0.1735547324
160 | 749045 | 9757.21 | 746256 | 9191.86 | -0.3723407806
**3.20E+02** | **745752** | **9819.65** | **745331** | **9840.62** | **-0.0564530836**
6.40E+02 | 749006 | 11080.5 | 748173 | 10578.7 | -0.1112140624
average over #-run | 743741.4286 | 9004.218571 | 744117.5714 | 9090.215714 | 0.05057441238
| table 4 - flush w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 477283 | 24719.6 | 473864 | 12379 | -0.7163464863
20 | 486743 | 20175.2 | 502296 | 23931.3 | 3.195320734
40 | 482846 | 15309.2 | 489820 | 22259.5 | 1.444352858
80 | 491490 | 21883.1 | 490071 | 23085.7 | -0.2887139108
160 | 493347 | 28074.3 | 483609 | 21211.7 | -1.973864238
**3.20E+02** | **487512** | **21401.5** | **485856** | **22195.2** | **-0.3396839462**
6.40E+02 | 490307 | 25418.6 | 485435 | 22405.2 | -0.9936631539
average over #-run | 4.87E+05 | 2.24E+04 | 4.87E+05 | 2.11E+04 | 0.00E+00
Reviewed By: ajkr
Differential Revision: D34442441
Pulled By: hx235
fbshipit-source-id: 4790f13e1e5c0a95ae1d1cc93ffcf69dc6e78bdd
3 years ago
|
|
|
IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
|
|
|
|
if (UNLIKELY(needs_locking)) {
|
|
|
|
log_write_mutex_.Unlock();
|
|
|
|
}
|
|
|
|
if (log_used != nullptr) {
|
|
|
|
*log_used = logfile_number_;
|
|
|
|
}
|
|
|
|
total_log_size_ += log_entry.size();
|
|
|
|
log_file_number_size.AddSize(*log_size);
|
|
|
|
log_empty_ = false;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
return io_s;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
|
log::Writer* log_writer, uint64_t* log_used,
|
|
|
|
bool need_log_sync, bool need_log_dir_sync,
|
|
|
|
SequenceNumber sequence,
|
|
|
|
LogFileNumberSize& log_file_number_size) {
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
IOStatus io_s;
|
Fix a TSAN-reported bug caused by concurrent accesss to std::deque (#9686)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9686
According to https://www.cplusplus.com/reference/deque/deque/back/,
"
The container is accessed (neither the const nor the non-const versions modify the container).
The last element is potentially accessed or modified by the caller. Concurrently accessing or modifying other elements is safe.
"
Also according to https://www.cplusplus.com/reference/deque/deque/pop_front/,
"
The container is modified.
The first element is modified. Concurrently accessing or modifying other elements is safe (although see iterator validity above).
"
In RocksDB, we never pop the last element of `DBImpl::alive_log_files_`. We have been
exploiting this fact and the above two properties when ensuring correctness when
`DBImpl::alive_log_files_` may be accessed concurrently. Specifically, it can be accessed
in the write path when db mutex is released. Sometimes, the log_mute_ is held. It can also be accessed in `FindObsoleteFiles()`
when db mutex is always held. It can also be accessed
during recovery when db mutex is also held.
Given the fact that we never pop the last element of alive_log_files_, we currently do not
acquire additional locks when accessing it in `WriteToWAL()` as follows
```
alive_log_files_.back().AddSize(log_entry.size());
```
This is problematic.
Check source code of deque.h
```
back() _GLIBCXX_NOEXCEPT
{
__glibcxx_requires_nonempty();
...
}
pop_front() _GLIBCXX_NOEXCEPT
{
...
if (this->_M_impl._M_start._M_cur
!= this->_M_impl._M_start._M_last - 1)
{
...
++this->_M_impl._M_start._M_cur;
}
...
}
```
`back()` will actually call `__glibcxx_requires_nonempty()` first.
If `__glibcxx_requires_nonempty()` is enabled and not an empty macro,
it will call `empty()`
```
bool empty() {
return this->_M_impl._M_finish == this->_M_impl._M_start;
}
```
You can see that it will access `this->_M_impl._M_start`, racing with `pop_front()`.
Therefore, TSAN will actually catch the bug in this case.
To be able to use TSAN on our library and unit tests, we should always coordinate
concurrent accesses to STL containers properly.
We need to pass information about db mutex and log mutex into `WriteToWAL()`, otherwise
it's impossible to know which mutex to acquire inside the function.
To fix this, we can catch the tail of `alive_log_files_` by reference, so that we do not have to call `back()` in `WriteToWAL()`.
Reviewed By: pdillinger
Differential Revision: D34780309
fbshipit-source-id: 1def9821f0c437f2736c6a26445d75890377889b
3 years ago
|
|
|
assert(!two_write_queues_);
|
|
|
|
assert(!write_group.leader->disable_wal);
|
|
|
|
// Same holds for all in the batch group
|
|
|
|
size_t write_with_wal = 0;
|
|
|
|
WriteBatch* to_be_cached_state = nullptr;
|
|
|
|
WriteBatch* merged_batch;
|
|
|
|
io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
|
|
|
|
&write_with_wal, &to_be_cached_state));
|
|
|
|
if (UNLIKELY(!io_s.ok())) {
|
|
|
|
return io_s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (merged_batch == write_group.leader->batch) {
|
|
|
|
write_group.leader->log_used = logfile_number_;
|
|
|
|
} else if (write_with_wal > 1) {
|
|
|
|
for (auto writer : write_group) {
|
|
|
|
writer->log_used = logfile_number_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatchInternal::SetSequence(merged_batch, sequence);
|
|
|
|
|
|
|
|
uint64_t log_size;
|
Rate-limit automatic WAL flush after each user write (#9607)
Summary:
**Context:**
WAL flush is currently not rate-limited by `Options::rate_limiter`. This PR is to provide rate-limiting to auto WAL flush, the one that automatically happen after each user write operation (i.e, `Options::manual_wal_flush == false`), by adding `WriteOptions::rate_limiter_options`.
Note that we are NOT rate-limiting WAL flush that do NOT automatically happen after each user write, such as `Options::manual_wal_flush == true + manual FlushWAL()` (rate-limiting multiple WAL flushes), for the benefits of:
- being consistent with [ReadOptions::rate_limiter_priority](https://github.com/facebook/rocksdb/blob/7.0.fb/include/rocksdb/options.h#L515)
- being able to turn off some WAL flush's rate-limiting but not all (e.g, turn off specific the WAL flush of a critical user write like a service's heartbeat)
`WriteOptions::rate_limiter_options` only accept `Env::IO_USER` and `Env::IO_TOTAL` currently due to an implementation constraint.
- The constraint is that we currently queue parallel writes (including WAL writes) based on FIFO policy which does not factor rate limiter priority into this layer's scheduling. If we allow lower priorities such as `Env::IO_HIGH/MID/LOW` and such writes specified with lower priorities occurs before ones specified with higher priorities (even just by a tiny bit in arrival time), the former would have blocked the latter, leading to a "priority inversion" issue and contradictory to what we promise for rate-limiting priority. Therefore we only allow `Env::IO_USER` and `Env::IO_TOTAL` right now before improving that scheduling.
A pre-requisite to this feature is to support operation-level rate limiting in `WritableFileWriter`, which is also included in this PR.
**Summary:**
- Renamed test suite `DBRateLimiterTest to DBRateLimiterOnReadTest` for adding a new test suite
- Accept `rate_limiter_priority` in `WritableFileWriter`'s private and public write functions
- Passed `WriteOptions::rate_limiter_options` to `WritableFileWriter` in the path of automatic WAL flush.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9607
Test Plan:
- Added new unit test to verify existing flush/compaction rate-limiting does not break, since `DBTest, RateLimitingTest` is disabled and current db-level rate-limiting tests focus on read only (e.g, `db_rate_limiter_test`, `DBTest2, RateLimitedCompactionReads`).
- Added new unit test `DBRateLimiterOnWriteWALTest, AutoWalFlush`
- `strace -ftt -e trace=write ./db_bench -benchmarks=fillseq -db=/dev/shm/testdb -rate_limit_auto_wal_flush=1 -rate_limiter_bytes_per_sec=15 -rate_limiter_refill_period_us=1000000 -write_buffer_size=100000000 -disable_auto_compactions=1 -num=100`
- verified that WAL flush(i.e, system-call _write_) were chunked into 15 bytes and each _write_ was roughly 1 second apart
- verified the chunking disappeared when `-rate_limit_auto_wal_flush=0`
- crash test: `python3 tools/db_crashtest.py blackbox --disable_wal=0 --rate_limit_auto_wal_flush=1 --rate_limiter_bytes_per_sec=10485760 --interval=10` killed as normal
**Benchmarked on flush/compaction to ensure no performance regression:**
- compaction with rate-limiting (see table 1, avg over 1280-run): pre-change: **915635 micros/op**; post-change:
**907350 micros/op (improved by 0.106%)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f compact_bmk_output.txt compact_bmk_output_2.txt dont_care_output.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench --benchmarks=fillrandom -db=$TEST_TMPDIR -disable_auto_compactions=1 -write_buffer_size=6710886 > dont_care_output.txt && ./db_bench --benchmarks=compact -use_existing_db=1 -db=$TEST_TMPDIR -level0_file_num_compaction_trigger=1 -rate_limiter_bytes_per_sec=100000000 | egrep 'compact'
done > compact_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' compact_bmk_output.txt >> compact_bmk_output_2.txt
done
```
- compaction w/o rate-limiting (see table 2, avg over 640-run): pre-change: **822197 micros/op**; post-change: **823148 micros/op (regressed by 0.12%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
- flush with rate-limiting (see table 3, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench ): pre-change: **745752 micros/op**; post-change: **745331 micros/op (regressed by 0.06 %)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f flush_bmk_output.txt flush_bmk_output_2.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench -db=$TEST_TMPDIR -write_buffer_size=1048576000 -num=1000000 -rate_limiter_bytes_per_sec=100000000 -benchmarks=fillseq,flush | egrep 'flush'
done > flush_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' flush_bmk_output.txt >> flush_bmk_output_2.txt
done
```
- flush w/o rate-limiting (see table 4, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench): pre-change: **487512 micros/op**, post-change: **485856 micors/ops (improved by 0.34%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
| table 1 - compact with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 896978 | 16046.9 | 901242 | 15670.9 | 0.475373978
20 | 893718 | 15813 | 886505 | 17544.7 | -0.8070778478
40 | 900426 | 23882.2 | 894958 | 15104.5 | -0.6072681153
80 | 906635 | 21761.5 | 903332 | 23948.3 | -0.3643141948
160 | 898632 | 21098.9 | 907583 | 21145 | 0.9960695813
3.20E+02 | 905252 | 22785.5 | 908106 | 25325.5 | 0.3152713278
6.40E+02 | 905213 | 23598.6 | 906741 | 21370.5 | 0.1688000504
**1.28E+03** | **908316** | **23533.1** | **907350** | **24626.8** | **-0.1063506533**
average over #-run | 901896.25 | 21064.9625 | 901977.125 | 20592.025 | 0.008967217682
| table 2 - compact w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 811211 | 26996.7 | 807586 | 28456.4 | -0.4468627768
20 | 815465 | 14803.7 | 814608 | 28719.7 | -0.105093413
40 | 809203 | 26187.1 | 797835 | 25492.1 | -1.404839082
80 | 822088 | 28765.3 | 822192 | 32840.4 | 0.01265071379
160 | 821719 | 36344.7 | 821664 | 29544.9 | -0.006693285661
3.20E+02 | 820921 | 27756.4 | 821403 | 28347.7 | 0.05871454135
**6.40E+02** | **822197** | **28960.6** | **823148** | **30055.1** | **0.1156657103**
average over #-run | 8.18E+05 | 2.71E+04 | 8.15E+05 | 2.91E+04 | -0.25
| table 3 - flush with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 741721 | 11770.8 | 740345 | 5949.76 | -0.1855144994
20 | 735169 | 3561.83 | 743199 | 9755.77 | 1.09226586
40 | 743368 | 8891.03 | 742102 | 8683.22 | -0.1703059588
80 | 742129 | 8148.51 | 743417 | 9631.58| 0.1735547324
160 | 749045 | 9757.21 | 746256 | 9191.86 | -0.3723407806
**3.20E+02** | **745752** | **9819.65** | **745331** | **9840.62** | **-0.0564530836**
6.40E+02 | 749006 | 11080.5 | 748173 | 10578.7 | -0.1112140624
average over #-run | 743741.4286 | 9004.218571 | 744117.5714 | 9090.215714 | 0.05057441238
| table 4 - flush w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 477283 | 24719.6 | 473864 | 12379 | -0.7163464863
20 | 486743 | 20175.2 | 502296 | 23931.3 | 3.195320734
40 | 482846 | 15309.2 | 489820 | 22259.5 | 1.444352858
80 | 491490 | 21883.1 | 490071 | 23085.7 | -0.2887139108
160 | 493347 | 28074.3 | 483609 | 21211.7 | -1.973864238
**3.20E+02** | **487512** | **21401.5** | **485856** | **22195.2** | **-0.3396839462**
6.40E+02 | 490307 | 25418.6 | 485435 | 22405.2 | -0.9936631539
average over #-run | 4.87E+05 | 2.24E+04 | 4.87E+05 | 2.11E+04 | 0.00E+00
Reviewed By: ajkr
Differential Revision: D34442441
Pulled By: hx235
fbshipit-source-id: 4790f13e1e5c0a95ae1d1cc93ffcf69dc6e78bdd
3 years ago
|
|
|
io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
|
|
|
|
write_group.leader->rate_limiter_priority,
|
|
|
|
log_file_number_size);
|
|
|
|
if (to_be_cached_state) {
|
|
|
|
cached_recoverable_state_ = *to_be_cached_state;
|
|
|
|
cached_recoverable_state_empty_ = false;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
if (io_s.ok() && need_log_sync) {
|
|
|
|
StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
|
|
|
|
// It's safe to access logs_ with unlocked mutex_ here because:
|
|
|
|
// - we've set getting_synced=true for all logs,
|
|
|
|
// so other threads won't pop from logs_ while we're here,
|
|
|
|
// - only writer thread can push to logs_, and we're in
|
|
|
|
// writer thread, so no one will push to logs_,
|
|
|
|
// - as long as other threads don't modify it, it's safe to read
|
|
|
|
// from std::deque from multiple threads concurrently.
|
|
|
|
//
|
|
|
|
// Sync operation should work with locked log_write_mutex_, because:
|
|
|
|
// when DBOptions.manual_wal_flush_ is set,
|
|
|
|
// FlushWAL function will be invoked by another thread.
|
|
|
|
// if without locked log_write_mutex_, the log file may get data
|
|
|
|
// corruption
|
|
|
|
|
|
|
|
const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
|
|
|
|
if (UNLIKELY(needs_locking)) {
|
|
|
|
log_write_mutex_.Lock();
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto& log : logs_) {
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
|
|
|
|
if (!io_s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
|
|
|
|
if (UNLIKELY(needs_locking)) {
|
|
|
|
log_write_mutex_.Unlock();
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
if (io_s.ok() && need_log_dir_sync) {
|
|
|
|
// We only sync WAL directory the first time WAL syncing is
|
|
|
|
// requested, so that in case users never turn on WAL sync,
|
|
|
|
// we can avoid the disk I/O in the write code path.
|
|
|
|
io_s = directories_.GetWalDir()->FsyncWithDirOptions(
|
|
|
|
IOOptions(), nullptr,
|
|
|
|
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (merged_batch == &tmp_batch_) {
|
|
|
|
tmp_batch_.Clear();
|
|
|
|
}
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
if (io_s.ok()) {
|
|
|
|
auto stats = default_cf_internal_stats_;
|
|
|
|
if (need_log_sync) {
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
|
|
|
|
RecordTick(stats_, WAL_FILE_SYNCED);
|
|
|
|
}
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
|
|
|
|
RecordTick(stats_, WAL_FILE_BYTES, log_size);
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
|
|
|
|
RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
|
|
|
|
}
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
return io_s;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
IOStatus DBImpl::ConcurrentWriteToWAL(
|
|
|
|
const WriteThread::WriteGroup& write_group, uint64_t* log_used,
|
|
|
|
SequenceNumber* last_sequence, size_t seq_inc) {
|
|
|
|
IOStatus io_s;
|
|
|
|
|
Fix a TSAN-reported bug caused by concurrent accesss to std::deque (#9686)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9686
According to https://www.cplusplus.com/reference/deque/deque/back/,
"
The container is accessed (neither the const nor the non-const versions modify the container).
The last element is potentially accessed or modified by the caller. Concurrently accessing or modifying other elements is safe.
"
Also according to https://www.cplusplus.com/reference/deque/deque/pop_front/,
"
The container is modified.
The first element is modified. Concurrently accessing or modifying other elements is safe (although see iterator validity above).
"
In RocksDB, we never pop the last element of `DBImpl::alive_log_files_`. We have been
exploiting this fact and the above two properties when ensuring correctness when
`DBImpl::alive_log_files_` may be accessed concurrently. Specifically, it can be accessed
in the write path when db mutex is released. Sometimes, the log_mute_ is held. It can also be accessed in `FindObsoleteFiles()`
when db mutex is always held. It can also be accessed
during recovery when db mutex is also held.
Given the fact that we never pop the last element of alive_log_files_, we currently do not
acquire additional locks when accessing it in `WriteToWAL()` as follows
```
alive_log_files_.back().AddSize(log_entry.size());
```
This is problematic.
Check source code of deque.h
```
back() _GLIBCXX_NOEXCEPT
{
__glibcxx_requires_nonempty();
...
}
pop_front() _GLIBCXX_NOEXCEPT
{
...
if (this->_M_impl._M_start._M_cur
!= this->_M_impl._M_start._M_last - 1)
{
...
++this->_M_impl._M_start._M_cur;
}
...
}
```
`back()` will actually call `__glibcxx_requires_nonempty()` first.
If `__glibcxx_requires_nonempty()` is enabled and not an empty macro,
it will call `empty()`
```
bool empty() {
return this->_M_impl._M_finish == this->_M_impl._M_start;
}
```
You can see that it will access `this->_M_impl._M_start`, racing with `pop_front()`.
Therefore, TSAN will actually catch the bug in this case.
To be able to use TSAN on our library and unit tests, we should always coordinate
concurrent accesses to STL containers properly.
We need to pass information about db mutex and log mutex into `WriteToWAL()`, otherwise
it's impossible to know which mutex to acquire inside the function.
To fix this, we can catch the tail of `alive_log_files_` by reference, so that we do not have to call `back()` in `WriteToWAL()`.
Reviewed By: pdillinger
Differential Revision: D34780309
fbshipit-source-id: 1def9821f0c437f2736c6a26445d75890377889b
3 years ago
|
|
|
assert(two_write_queues_ || immutable_db_options_.unordered_write);
|
|
|
|
assert(!write_group.leader->disable_wal);
|
|
|
|
// Same holds for all in the batch group
|
|
|
|
WriteBatch tmp_batch;
|
|
|
|
size_t write_with_wal = 0;
|
|
|
|
WriteBatch* to_be_cached_state = nullptr;
|
|
|
|
WriteBatch* merged_batch;
|
|
|
|
io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch, &merged_batch,
|
|
|
|
&write_with_wal, &to_be_cached_state));
|
|
|
|
if (UNLIKELY(!io_s.ok())) {
|
|
|
|
return io_s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We need to lock log_write_mutex_ since logs_ and alive_log_files might be
|
|
|
|
// pushed back concurrently
|
|
|
|
log_write_mutex_.Lock();
|
|
|
|
if (merged_batch == write_group.leader->batch) {
|
|
|
|
write_group.leader->log_used = logfile_number_;
|
|
|
|
} else if (write_with_wal > 1) {
|
|
|
|
for (auto writer : write_group) {
|
|
|
|
writer->log_used = logfile_number_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
|
|
|
|
auto sequence = *last_sequence + 1;
|
|
|
|
WriteBatchInternal::SetSequence(merged_batch, sequence);
|
|
|
|
|
|
|
|
log::Writer* log_writer = logs_.back().writer;
|
|
|
|
LogFileNumberSize& log_file_number_size = alive_log_files_.back();
|
|
|
|
|
|
|
|
assert(log_writer->get_log_number() == log_file_number_size.number);
|
|
|
|
|
|
|
|
uint64_t log_size;
|
Rate-limit automatic WAL flush after each user write (#9607)
Summary:
**Context:**
WAL flush is currently not rate-limited by `Options::rate_limiter`. This PR is to provide rate-limiting to auto WAL flush, the one that automatically happen after each user write operation (i.e, `Options::manual_wal_flush == false`), by adding `WriteOptions::rate_limiter_options`.
Note that we are NOT rate-limiting WAL flush that do NOT automatically happen after each user write, such as `Options::manual_wal_flush == true + manual FlushWAL()` (rate-limiting multiple WAL flushes), for the benefits of:
- being consistent with [ReadOptions::rate_limiter_priority](https://github.com/facebook/rocksdb/blob/7.0.fb/include/rocksdb/options.h#L515)
- being able to turn off some WAL flush's rate-limiting but not all (e.g, turn off specific the WAL flush of a critical user write like a service's heartbeat)
`WriteOptions::rate_limiter_options` only accept `Env::IO_USER` and `Env::IO_TOTAL` currently due to an implementation constraint.
- The constraint is that we currently queue parallel writes (including WAL writes) based on FIFO policy which does not factor rate limiter priority into this layer's scheduling. If we allow lower priorities such as `Env::IO_HIGH/MID/LOW` and such writes specified with lower priorities occurs before ones specified with higher priorities (even just by a tiny bit in arrival time), the former would have blocked the latter, leading to a "priority inversion" issue and contradictory to what we promise for rate-limiting priority. Therefore we only allow `Env::IO_USER` and `Env::IO_TOTAL` right now before improving that scheduling.
A pre-requisite to this feature is to support operation-level rate limiting in `WritableFileWriter`, which is also included in this PR.
**Summary:**
- Renamed test suite `DBRateLimiterTest to DBRateLimiterOnReadTest` for adding a new test suite
- Accept `rate_limiter_priority` in `WritableFileWriter`'s private and public write functions
- Passed `WriteOptions::rate_limiter_options` to `WritableFileWriter` in the path of automatic WAL flush.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9607
Test Plan:
- Added new unit test to verify existing flush/compaction rate-limiting does not break, since `DBTest, RateLimitingTest` is disabled and current db-level rate-limiting tests focus on read only (e.g, `db_rate_limiter_test`, `DBTest2, RateLimitedCompactionReads`).
- Added new unit test `DBRateLimiterOnWriteWALTest, AutoWalFlush`
- `strace -ftt -e trace=write ./db_bench -benchmarks=fillseq -db=/dev/shm/testdb -rate_limit_auto_wal_flush=1 -rate_limiter_bytes_per_sec=15 -rate_limiter_refill_period_us=1000000 -write_buffer_size=100000000 -disable_auto_compactions=1 -num=100`
- verified that WAL flush(i.e, system-call _write_) were chunked into 15 bytes and each _write_ was roughly 1 second apart
- verified the chunking disappeared when `-rate_limit_auto_wal_flush=0`
- crash test: `python3 tools/db_crashtest.py blackbox --disable_wal=0 --rate_limit_auto_wal_flush=1 --rate_limiter_bytes_per_sec=10485760 --interval=10` killed as normal
**Benchmarked on flush/compaction to ensure no performance regression:**
- compaction with rate-limiting (see table 1, avg over 1280-run): pre-change: **915635 micros/op**; post-change:
**907350 micros/op (improved by 0.106%)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f compact_bmk_output.txt compact_bmk_output_2.txt dont_care_output.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench --benchmarks=fillrandom -db=$TEST_TMPDIR -disable_auto_compactions=1 -write_buffer_size=6710886 > dont_care_output.txt && ./db_bench --benchmarks=compact -use_existing_db=1 -db=$TEST_TMPDIR -level0_file_num_compaction_trigger=1 -rate_limiter_bytes_per_sec=100000000 | egrep 'compact'
done > compact_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' compact_bmk_output.txt >> compact_bmk_output_2.txt
done
```
- compaction w/o rate-limiting (see table 2, avg over 640-run): pre-change: **822197 micros/op**; post-change: **823148 micros/op (regressed by 0.12%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
- flush with rate-limiting (see table 3, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench ): pre-change: **745752 micros/op**; post-change: **745331 micros/op (regressed by 0.06 %)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f flush_bmk_output.txt flush_bmk_output_2.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench -db=$TEST_TMPDIR -write_buffer_size=1048576000 -num=1000000 -rate_limiter_bytes_per_sec=100000000 -benchmarks=fillseq,flush | egrep 'flush'
done > flush_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' flush_bmk_output.txt >> flush_bmk_output_2.txt
done
```
- flush w/o rate-limiting (see table 4, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench): pre-change: **487512 micros/op**, post-change: **485856 micors/ops (improved by 0.34%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
| table 1 - compact with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 896978 | 16046.9 | 901242 | 15670.9 | 0.475373978
20 | 893718 | 15813 | 886505 | 17544.7 | -0.8070778478
40 | 900426 | 23882.2 | 894958 | 15104.5 | -0.6072681153
80 | 906635 | 21761.5 | 903332 | 23948.3 | -0.3643141948
160 | 898632 | 21098.9 | 907583 | 21145 | 0.9960695813
3.20E+02 | 905252 | 22785.5 | 908106 | 25325.5 | 0.3152713278
6.40E+02 | 905213 | 23598.6 | 906741 | 21370.5 | 0.1688000504
**1.28E+03** | **908316** | **23533.1** | **907350** | **24626.8** | **-0.1063506533**
average over #-run | 901896.25 | 21064.9625 | 901977.125 | 20592.025 | 0.008967217682
| table 2 - compact w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 811211 | 26996.7 | 807586 | 28456.4 | -0.4468627768
20 | 815465 | 14803.7 | 814608 | 28719.7 | -0.105093413
40 | 809203 | 26187.1 | 797835 | 25492.1 | -1.404839082
80 | 822088 | 28765.3 | 822192 | 32840.4 | 0.01265071379
160 | 821719 | 36344.7 | 821664 | 29544.9 | -0.006693285661
3.20E+02 | 820921 | 27756.4 | 821403 | 28347.7 | 0.05871454135
**6.40E+02** | **822197** | **28960.6** | **823148** | **30055.1** | **0.1156657103**
average over #-run | 8.18E+05 | 2.71E+04 | 8.15E+05 | 2.91E+04 | -0.25
| table 3 - flush with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 741721 | 11770.8 | 740345 | 5949.76 | -0.1855144994
20 | 735169 | 3561.83 | 743199 | 9755.77 | 1.09226586
40 | 743368 | 8891.03 | 742102 | 8683.22 | -0.1703059588
80 | 742129 | 8148.51 | 743417 | 9631.58| 0.1735547324
160 | 749045 | 9757.21 | 746256 | 9191.86 | -0.3723407806
**3.20E+02** | **745752** | **9819.65** | **745331** | **9840.62** | **-0.0564530836**
6.40E+02 | 749006 | 11080.5 | 748173 | 10578.7 | -0.1112140624
average over #-run | 743741.4286 | 9004.218571 | 744117.5714 | 9090.215714 | 0.05057441238
| table 4 - flush w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 477283 | 24719.6 | 473864 | 12379 | -0.7163464863
20 | 486743 | 20175.2 | 502296 | 23931.3 | 3.195320734
40 | 482846 | 15309.2 | 489820 | 22259.5 | 1.444352858
80 | 491490 | 21883.1 | 490071 | 23085.7 | -0.2887139108
160 | 493347 | 28074.3 | 483609 | 21211.7 | -1.973864238
**3.20E+02** | **487512** | **21401.5** | **485856** | **22195.2** | **-0.3396839462**
6.40E+02 | 490307 | 25418.6 | 485435 | 22405.2 | -0.9936631539
average over #-run | 4.87E+05 | 2.24E+04 | 4.87E+05 | 2.11E+04 | 0.00E+00
Reviewed By: ajkr
Differential Revision: D34442441
Pulled By: hx235
fbshipit-source-id: 4790f13e1e5c0a95ae1d1cc93ffcf69dc6e78bdd
3 years ago
|
|
|
io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
|
Fix a TSAN-reported bug caused by concurrent accesss to std::deque (#9686)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9686
According to https://www.cplusplus.com/reference/deque/deque/back/,
"
The container is accessed (neither the const nor the non-const versions modify the container).
The last element is potentially accessed or modified by the caller. Concurrently accessing or modifying other elements is safe.
"
Also according to https://www.cplusplus.com/reference/deque/deque/pop_front/,
"
The container is modified.
The first element is modified. Concurrently accessing or modifying other elements is safe (although see iterator validity above).
"
In RocksDB, we never pop the last element of `DBImpl::alive_log_files_`. We have been
exploiting this fact and the above two properties when ensuring correctness when
`DBImpl::alive_log_files_` may be accessed concurrently. Specifically, it can be accessed
in the write path when db mutex is released. Sometimes, the log_mute_ is held. It can also be accessed in `FindObsoleteFiles()`
when db mutex is always held. It can also be accessed
during recovery when db mutex is also held.
Given the fact that we never pop the last element of alive_log_files_, we currently do not
acquire additional locks when accessing it in `WriteToWAL()` as follows
```
alive_log_files_.back().AddSize(log_entry.size());
```
This is problematic.
Check source code of deque.h
```
back() _GLIBCXX_NOEXCEPT
{
__glibcxx_requires_nonempty();
...
}
pop_front() _GLIBCXX_NOEXCEPT
{
...
if (this->_M_impl._M_start._M_cur
!= this->_M_impl._M_start._M_last - 1)
{
...
++this->_M_impl._M_start._M_cur;
}
...
}
```
`back()` will actually call `__glibcxx_requires_nonempty()` first.
If `__glibcxx_requires_nonempty()` is enabled and not an empty macro,
it will call `empty()`
```
bool empty() {
return this->_M_impl._M_finish == this->_M_impl._M_start;
}
```
You can see that it will access `this->_M_impl._M_start`, racing with `pop_front()`.
Therefore, TSAN will actually catch the bug in this case.
To be able to use TSAN on our library and unit tests, we should always coordinate
concurrent accesses to STL containers properly.
We need to pass information about db mutex and log mutex into `WriteToWAL()`, otherwise
it's impossible to know which mutex to acquire inside the function.
To fix this, we can catch the tail of `alive_log_files_` by reference, so that we do not have to call `back()` in `WriteToWAL()`.
Reviewed By: pdillinger
Differential Revision: D34780309
fbshipit-source-id: 1def9821f0c437f2736c6a26445d75890377889b
3 years ago
|
|
|
write_group.leader->rate_limiter_priority,
|
|
|
|
log_file_number_size);
|
|
|
|
if (to_be_cached_state) {
|
|
|
|
cached_recoverable_state_ = *to_be_cached_state;
|
|
|
|
cached_recoverable_state_empty_ = false;
|
|
|
|
}
|
|
|
|
log_write_mutex_.Unlock();
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
if (io_s.ok()) {
|
|
|
|
const bool concurrent = true;
|
|
|
|
auto stats = default_cf_internal_stats_;
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
|
|
|
|
concurrent);
|
|
|
|
RecordTick(stats_, WAL_FILE_BYTES, log_size);
|
|
|
|
stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
|
|
|
|
concurrent);
|
|
|
|
RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
|
|
|
|
}
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
return io_s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::WriteRecoverableState() {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
if (!cached_recoverable_state_empty_) {
|
|
|
|
bool dont_care_bool;
|
|
|
|
SequenceNumber next_seq;
|
|
|
|
if (two_write_queues_) {
|
|
|
|
log_write_mutex_.Lock();
|
|
|
|
}
|
|
|
|
SequenceNumber seq;
|
|
|
|
if (two_write_queues_) {
|
|
|
|
seq = versions_->FetchAddLastAllocatedSequence(0);
|
|
|
|
} else {
|
|
|
|
seq = versions_->LastSequence();
|
|
|
|
}
|
|
|
|
WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1);
|
|
|
|
auto status = WriteBatchInternal::InsertInto(
|
|
|
|
&cached_recoverable_state_, column_family_memtables_.get(),
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
&flush_scheduler_, &trim_history_scheduler_, true,
|
|
|
|
0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
|
|
|
|
&next_seq, &dont_care_bool, seq_per_batch_);
|
|
|
|
auto last_seq = next_seq - 1;
|
|
|
|
if (two_write_queues_) {
|
|
|
|
versions_->FetchAddLastAllocatedSequence(last_seq - seq);
|
|
|
|
versions_->SetLastPublishedSequence(last_seq);
|
|
|
|
}
|
|
|
|
versions_->SetLastSequence(last_seq);
|
|
|
|
if (two_write_queues_) {
|
|
|
|
log_write_mutex_.Unlock();
|
|
|
|
}
|
|
|
|
if (status.ok() && recoverable_state_pre_release_callback_) {
|
|
|
|
const bool DISABLE_MEMTABLE = true;
|
|
|
|
for (uint64_t sub_batch_seq = seq + 1;
|
|
|
|
sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
|
|
|
|
uint64_t const no_log_num = 0;
|
|
|
|
// Unlock it since the callback might end up locking mutex. e.g.,
|
|
|
|
// AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
|
|
|
|
mutex_.Unlock();
|
|
|
|
status = recoverable_state_pre_release_callback_->Callback(
|
|
|
|
sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (status.ok()) {
|
|
|
|
cached_recoverable_state_.Clear();
|
|
|
|
cached_recoverable_state_empty_ = true;
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::SelectColumnFamiliesForAtomicFlush(
|
|
|
|
autovector<ColumnFamilyData*>* cfds) {
|
|
|
|
for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
|
|
|
|
!cached_recoverable_state_empty_.load()) {
|
|
|
|
cfds->push_back(cfd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assign sequence number for atomic flush.
|
|
|
|
void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
|
|
|
|
assert(immutable_db_options_.atomic_flush);
|
|
|
|
auto seq = versions_->LastSequence();
|
|
|
|
for (auto cfd : cfds) {
|
|
|
|
cfd->imm()->AssignAtomicFlushSeq(seq);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::SwitchWAL(WriteContext* write_context) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
assert(write_context != nullptr);
|
|
|
|
Status status;
|
|
|
|
|
|
|
|
if (alive_log_files_.begin()->getting_flushed) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto oldest_alive_log = alive_log_files_.begin()->number;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
bool flush_wont_release_oldest_log = false;
|
|
|
|
if (allow_2pc()) {
|
|
|
|
auto oldest_log_with_uncommitted_prep =
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
|
|
|
|
|
|
|
|
assert(oldest_log_with_uncommitted_prep == 0 ||
|
|
|
|
oldest_log_with_uncommitted_prep >= oldest_alive_log);
|
|
|
|
if (oldest_log_with_uncommitted_prep > 0 &&
|
|
|
|
oldest_log_with_uncommitted_prep == oldest_alive_log) {
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
if (unable_to_release_oldest_log_) {
|
|
|
|
// we already attempted to flush all column families dependent on
|
|
|
|
// the oldest alive log but the log still contained uncommitted
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
// transactions so there is still nothing that we can do.
|
|
|
|
return status;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
} else {
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
immutable_db_options_.info_log,
|
|
|
|
"Unable to release oldest log due to uncommitted transaction");
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
unable_to_release_oldest_log_ = true;
|
|
|
|
flush_wont_release_oldest_log = true;
|
|
|
|
}
|
|
|
|
}
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
}
|
|
|
|
if (!flush_wont_release_oldest_log) {
|
|
|
|
// we only mark this log as getting flushed if we have successfully
|
|
|
|
// flushed all data in this log. If this log contains outstanding prepared
|
|
|
|
// transactions then we cannot flush this log until those transactions are
|
|
|
|
// commited.
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
7 years ago
|
|
|
unable_to_release_oldest_log_ = false;
|
|
|
|
alive_log_files_.begin()->getting_flushed = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
immutable_db_options_.info_log,
|
|
|
|
"Flushing all column families with data in WAL number %" PRIu64
|
|
|
|
". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
|
|
|
|
oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
|
|
|
|
// no need to refcount because drop is happening in write thread, so can't
|
|
|
|
// happen while we're in the write thread
|
|
|
|
autovector<ColumnFamilyData*> cfds;
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
SelectColumnFamiliesForAtomicFlush(&cfds);
|
|
|
|
} else {
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (cfd->OldestLogToKeep() <= oldest_alive_log) {
|
|
|
|
cfds.push_back(cfd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MaybeFlushStatsCF(&cfds);
|
|
|
|
}
|
|
|
|
WriteThread::Writer nonmem_w;
|
|
|
|
if (two_write_queues_) {
|
|
|
|
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto cfd : cfds) {
|
|
|
|
cfd->Ref();
|
|
|
|
status = SwitchMemtable(cfd, write_context);
|
|
|
|
cfd->UnrefAndTryDelete();
|
|
|
|
if (!status.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (two_write_queues_) {
|
|
|
|
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
AssignAtomicFlushSeq(cfds);
|
|
|
|
}
|
|
|
|
for (auto cfd : cfds) {
|
|
|
|
cfd->imm()->FlushRequested();
|
|
|
|
if (!immutable_db_options_.atomic_flush) {
|
|
|
|
FlushRequest flush_req;
|
|
|
|
GenerateFlushRequest({cfd}, &flush_req);
|
|
|
|
SchedulePendingFlush(flush_req, FlushReason::kWalFull);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
FlushRequest flush_req;
|
|
|
|
GenerateFlushRequest(cfds, &flush_req);
|
|
|
|
SchedulePendingFlush(flush_req, FlushReason::kWalFull);
|
|
|
|
}
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
assert(write_context != nullptr);
|
|
|
|
Status status;
|
|
|
|
|
|
|
|
// Before a new memtable is added in SwitchMemtable(),
|
|
|
|
// write_buffer_manager_->ShouldFlush() will keep returning true. If another
|
|
|
|
// thread is writing to another DB with the same write buffer, they may also
|
|
|
|
// be flushed. We may end up with flushing much more DBs than needed. It's
|
|
|
|
// suboptimal but still correct.
|
|
|
|
// no need to refcount because drop is happening in write thread, so can't
|
|
|
|
// happen while we're in the write thread
|
|
|
|
autovector<ColumnFamilyData*> cfds;
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
SelectColumnFamiliesForAtomicFlush(&cfds);
|
|
|
|
} else {
|
|
|
|
ColumnFamilyData* cfd_picked = nullptr;
|
|
|
|
SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
|
|
|
|
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) {
|
|
|
|
// We only consider flush on CFs with bytes in the mutable memtable,
|
|
|
|
// and no immutable memtables for which flush has yet to finish. If
|
|
|
|
// we triggered flush on CFs already trying to flush, we would risk
|
|
|
|
// creating too many immutable memtables leading to write stalls.
|
|
|
|
uint64_t seq = cfd->mem()->GetCreationSeq();
|
|
|
|
if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
|
|
|
|
cfd_picked = cfd;
|
|
|
|
seq_num_for_cf_picked = seq;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (cfd_picked != nullptr) {
|
|
|
|
cfds.push_back(cfd_picked);
|
|
|
|
}
|
|
|
|
MaybeFlushStatsCF(&cfds);
|
|
|
|
}
|
|
|
|
if (!cfds.empty()) {
|
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
immutable_db_options_.info_log,
|
|
|
|
"Flushing triggered to alleviate write buffer memory usage. Write "
|
|
|
|
"buffer is using %" ROCKSDB_PRIszt
|
|
|
|
" bytes out of a total of %" ROCKSDB_PRIszt ".",
|
|
|
|
write_buffer_manager_->memory_usage(),
|
|
|
|
write_buffer_manager_->buffer_size());
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteThread::Writer nonmem_w;
|
|
|
|
if (two_write_queues_) {
|
|
|
|
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
|
|
|
}
|
|
|
|
for (const auto cfd : cfds) {
|
|
|
|
if (cfd->mem()->IsEmpty()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
cfd->Ref();
|
|
|
|
status = SwitchMemtable(cfd, write_context);
|
|
|
|
cfd->UnrefAndTryDelete();
|
|
|
|
if (!status.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (two_write_queues_) {
|
|
|
|
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
AssignAtomicFlushSeq(cfds);
|
|
|
|
}
|
|
|
|
for (const auto cfd : cfds) {
|
|
|
|
cfd->imm()->FlushRequested();
|
|
|
|
if (!immutable_db_options_.atomic_flush) {
|
|
|
|
FlushRequest flush_req;
|
|
|
|
GenerateFlushRequest({cfd}, &flush_req);
|
|
|
|
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
FlushRequest flush_req;
|
|
|
|
GenerateFlushRequest(cfds, &flush_req);
|
|
|
|
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
|
|
|
|
}
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t DBImpl::GetMaxTotalWalSize() const {
|
|
|
|
uint64_t max_total_wal_size =
|
|
|
|
max_total_wal_size_.load(std::memory_order_acquire);
|
|
|
|
if (max_total_wal_size > 0) {
|
|
|
|
return max_total_wal_size;
|
|
|
|
}
|
|
|
|
return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
|
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: mutex_ is held
|
|
|
|
// REQUIRES: this thread is currently at the front of the writer queue
|
|
|
|
Status DBImpl::DelayWrite(uint64_t num_bytes,
|
|
|
|
const WriteOptions& write_options) {
|
|
|
|
uint64_t time_delayed = 0;
|
|
|
|
bool delayed = false;
|
|
|
|
{
|
|
|
|
StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL,
|
|
|
|
&time_delayed);
|
|
|
|
uint64_t delay =
|
|
|
|
write_controller_.GetDelay(immutable_db_options_.clock, num_bytes);
|
|
|
|
TEST_SYNC_POINT("DBImpl::DelayWrite:Start");
|
|
|
|
if (delay > 0) {
|
|
|
|
if (write_options.no_slowdown) {
|
|
|
|
return Status::Incomplete("Write stall");
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
|
|
|
|
|
|
|
|
// Notify write_thread_ about the stall so it can setup a barrier and
|
|
|
|
// fail any pending writers with no_slowdown
|
|
|
|
write_thread_.BeginWriteStall();
|
|
|
|
mutex_.Unlock();
|
|
|
|
TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone");
|
|
|
|
// We will delay the write until we have slept for `delay` microseconds
|
|
|
|
// or we don't need a delay anymore. We check for cancellation every 1ms
|
|
|
|
// (slightly longer because WriteController minimum delay is 1ms, in
|
|
|
|
// case of sleep imprecision, rounding, etc.)
|
|
|
|
const uint64_t kDelayInterval = 1001;
|
|
|
|
uint64_t stall_end = sw.start_time() + delay;
|
|
|
|
while (write_controller_.NeedsDelay()) {
|
|
|
|
if (immutable_db_options_.clock->NowMicros() >= stall_end) {
|
|
|
|
// We already delayed this write `delay` microseconds
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
delayed = true;
|
|
|
|
// Sleep for 0.001 seconds
|
|
|
|
immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval);
|
|
|
|
}
|
|
|
|
mutex_.Lock();
|
|
|
|
write_thread_.EndWriteStall();
|
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
6 years ago
|
|
|
// Don't wait if there's a background error, even if its a soft error. We
|
|
|
|
// might wait here indefinitely as the background compaction may never
|
|
|
|
// finish successfully, resulting in the stall condition lasting
|
|
|
|
// indefinitely
|
|
|
|
while (error_handler_.GetBGError().ok() && write_controller_.IsStopped() &&
|
|
|
|
!shutting_down_.load(std::memory_order_relaxed)) {
|
|
|
|
if (write_options.no_slowdown) {
|
|
|
|
return Status::Incomplete("Write stall");
|
|
|
|
}
|
|
|
|
delayed = true;
|
|
|
|
|
|
|
|
// Notify write_thread_ about the stall so it can setup a barrier and
|
|
|
|
// fail any pending writers with no_slowdown
|
|
|
|
write_thread_.BeginWriteStall();
|
|
|
|
TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
|
|
|
|
bg_cv_.Wait();
|
|
|
|
write_thread_.EndWriteStall();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(!delayed || !write_options.no_slowdown);
|
|
|
|
if (delayed) {
|
|
|
|
default_cf_internal_stats_->AddDBStats(
|
|
|
|
InternalStats::kIntStatsWriteStallMicros, time_delayed);
|
|
|
|
RecordTick(stats_, STALL_MICROS, time_delayed);
|
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
6 years ago
|
|
|
// If DB is not in read-only mode and write_controller is not stopping
|
|
|
|
// writes, we can ignore any background errors and allow the write to
|
|
|
|
// proceed
|
|
|
|
Status s;
|
|
|
|
if (write_controller_.IsStopped()) {
|
|
|
|
if (!shutting_down_.load(std::memory_order_relaxed)) {
|
|
|
|
// If writes are still stopped and db not shutdown, it means we bailed
|
|
|
|
// due to a background error
|
|
|
|
s = Status::Incomplete(error_handler_.GetBGError().ToString());
|
|
|
|
} else {
|
|
|
|
s = Status::ShutdownInProgress("stalled writes");
|
|
|
|
}
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
6 years ago
|
|
|
}
|
|
|
|
if (error_handler_.IsDBStopped()) {
|
|
|
|
s = error_handler_.GetBGError();
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: mutex_ is held
|
|
|
|
// REQUIRES: this thread is currently at the front of the writer queue
|
|
|
|
void DBImpl::WriteBufferManagerStallWrites() {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
// First block future writer threads who want to add themselves to the queue
|
|
|
|
// of WriteThread.
|
|
|
|
write_thread_.BeginWriteStall();
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
// Change the state to State::Blocked.
|
|
|
|
static_cast<WBMStallInterface*>(wbm_stall_.get())
|
|
|
|
->SetState(WBMStallInterface::State::BLOCKED);
|
|
|
|
// Then WriteBufferManager will add DB instance to its queue
|
|
|
|
// and block this thread by calling WBMStallInterface::Block().
|
|
|
|
write_buffer_manager_->BeginWriteStall(wbm_stall_.get());
|
|
|
|
wbm_stall_->Block();
|
|
|
|
|
|
|
|
mutex_.Lock();
|
|
|
|
// Stall has ended. Signal writer threads so that they can add
|
|
|
|
// themselves to the WriteThread queue for writes.
|
|
|
|
write_thread_.EndWriteStall();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch) {
|
|
|
|
assert(write_options.low_pri);
|
|
|
|
// This is called outside the DB mutex. Although it is safe to make the call,
|
|
|
|
// the consistency condition is not guaranteed to hold. It's OK to live with
|
|
|
|
// it in this case.
|
|
|
|
// If we need to speed compaction, it means the compaction is left behind
|
|
|
|
// and we start to limit low pri writes to a limit.
|
|
|
|
if (write_controller_.NeedSpeedupCompaction()) {
|
|
|
|
if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
|
|
|
|
// For 2PC, we only rate limit prepare, not commit.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
if (write_options.no_slowdown) {
|
|
|
|
return Status::Incomplete("Low priority write stall");
|
|
|
|
} else {
|
|
|
|
assert(my_batch != nullptr);
|
|
|
|
// Rate limit those writes. The reason that we don't completely wait
|
|
|
|
// is that in case the write is heavy, low pri writes may never have
|
|
|
|
// a chance to run. Now we guarantee we are still slowly making
|
|
|
|
// progress.
|
|
|
|
PERF_TIMER_GUARD(write_delay_time);
|
|
|
|
write_controller_.low_pri_rate_limiter()->Request(
|
|
|
|
my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
|
|
|
|
RateLimiter::OpType::kWrite);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
|
|
|
|
assert(cfds != nullptr);
|
|
|
|
if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
|
|
|
|
ColumnFamilyData* cfd_stats =
|
|
|
|
versions_->GetColumnFamilySet()->GetColumnFamily(
|
|
|
|
kPersistentStatsColumnFamilyName);
|
|
|
|
if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
|
|
|
|
for (ColumnFamilyData* cfd : *cfds) {
|
|
|
|
if (cfd == cfd_stats) {
|
|
|
|
// stats CF already included in cfds
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// force flush stats CF when its log number is less than all other CF's
|
|
|
|
// log numbers
|
|
|
|
bool force_flush_stats_cf = true;
|
|
|
|
for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (loop_cfd == cfd_stats) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
|
|
|
|
force_flush_stats_cf = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (force_flush_stats_cf) {
|
|
|
|
cfds->push_back(cfd_stats);
|
|
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
|
|
"Force flushing stats CF with automated flush "
|
|
|
|
"to avoid holding old logs");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
Status DBImpl::TrimMemtableHistory(WriteContext* context) {
|
|
|
|
autovector<ColumnFamilyData*> cfds;
|
|
|
|
ColumnFamilyData* tmp_cfd;
|
|
|
|
while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
|
|
|
|
nullptr) {
|
|
|
|
cfds.push_back(tmp_cfd);
|
|
|
|
}
|
|
|
|
for (auto& cfd : cfds) {
|
|
|
|
autovector<MemTable*> to_delete;
|
|
|
|
bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_,
|
|
|
|
cfd->mem()->MemoryAllocatedBytes());
|
|
|
|
if (trimmed) {
|
|
|
|
context->superversion_context.NewSuperVersion();
|
|
|
|
assert(context->superversion_context.new_superversion.get() != nullptr);
|
|
|
|
cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
}
|
|
|
|
|
|
|
|
if (cfd->UnrefAndTryDelete()) {
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
cfd = nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::ScheduleFlushes(WriteContext* context) {
|
|
|
|
autovector<ColumnFamilyData*> cfds;
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
SelectColumnFamiliesForAtomicFlush(&cfds);
|
|
|
|
for (auto cfd : cfds) {
|
|
|
|
cfd->Ref();
|
|
|
|
}
|
|
|
|
flush_scheduler_.Clear();
|
|
|
|
} else {
|
|
|
|
ColumnFamilyData* tmp_cfd;
|
|
|
|
while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
|
|
|
|
cfds.push_back(tmp_cfd);
|
|
|
|
}
|
|
|
|
MaybeFlushStatsCF(&cfds);
|
|
|
|
}
|
|
|
|
Status status;
|
|
|
|
WriteThread::Writer nonmem_w;
|
|
|
|
if (two_write_queues_) {
|
|
|
|
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto& cfd : cfds) {
|
|
|
|
if (!cfd->mem()->IsEmpty()) {
|
|
|
|
status = SwitchMemtable(cfd, context);
|
|
|
|
}
|
|
|
|
if (cfd->UnrefAndTryDelete()) {
|
|
|
|
cfd = nullptr;
|
|
|
|
}
|
|
|
|
if (!status.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (two_write_queues_) {
|
|
|
|
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
if (immutable_db_options_.atomic_flush) {
|
|
|
|
AssignAtomicFlushSeq(cfds);
|
|
|
|
FlushRequest flush_req;
|
|
|
|
GenerateFlushRequest(cfds, &flush_req);
|
|
|
|
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
|
|
|
|
} else {
|
|
|
|
for (auto* cfd : cfds) {
|
|
|
|
FlushRequest flush_req;
|
|
|
|
GenerateFlushRequest({cfd}, &flush_req);
|
|
|
|
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
|
|
|
|
const MemTableInfo& mem_table_info) {
|
|
|
|
if (immutable_db_options_.listeners.size() == 0U) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_.Unlock();
|
|
|
|
for (auto listener : immutable_db_options_.listeners) {
|
|
|
|
listener->OnMemTableSealed(mem_table_info);
|
|
|
|
}
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
|
|
|
// REQUIRES: mutex_ is held
|
|
|
|
// REQUIRES: this thread is currently at the front of the writer queue
|
|
|
|
// REQUIRES: this thread is currently at the front of the 2nd writer queue if
|
|
|
|
// two_write_queues_ is true (This is to simplify the reasoning.)
|
|
|
|
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
log::Writer* new_log = nullptr;
|
|
|
|
MemTable* new_mem = nullptr;
|
|
|
|
IOStatus io_s;
|
|
|
|
|
|
|
|
// Recoverable state is persisted in WAL. After memtable switch, WAL might
|
|
|
|
// be deleted, so we write the state to memtable to be persisted as well.
|
|
|
|
Status s = WriteRecoverableState();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Attempt to switch to a new memtable and trigger flush of old.
|
|
|
|
// Do this without holding the dbmutex lock.
|
|
|
|
assert(versions_->prev_log_number() == 0);
|
|
|
|
if (two_write_queues_) {
|
|
|
|
log_write_mutex_.Lock();
|
|
|
|
}
|
|
|
|
bool creating_new_log = !log_empty_;
|
|
|
|
if (two_write_queues_) {
|
|
|
|
log_write_mutex_.Unlock();
|
|
|
|
}
|
|
|
|
uint64_t recycle_log_number = 0;
|
|
|
|
if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
|
|
|
|
!log_recycle_files_.empty()) {
|
|
|
|
recycle_log_number = log_recycle_files_.front();
|
|
|
|
}
|
|
|
|
uint64_t new_log_number =
|
|
|
|
creating_new_log ? versions_->NewFileNumber() : logfile_number_;
|
|
|
|
const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
|
|
|
|
|
|
|
|
// Set memtable_info for memtable sealed callback
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
MemTableInfo memtable_info;
|
|
|
|
memtable_info.cf_name = cfd->GetName();
|
|
|
|
memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
|
|
|
|
memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
|
|
|
|
memtable_info.num_entries = cfd->mem()->num_entries();
|
|
|
|
memtable_info.num_deletes = cfd->mem()->num_deletes();
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
// Log this later after lock release. It may be outdated, e.g., if background
|
|
|
|
// flush happens before logging, but that should be ok.
|
|
|
|
int num_imm_unflushed = cfd->imm()->NumNotFlushed();
|
|
|
|
const auto preallocate_block_size =
|
|
|
|
GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
|
|
|
|
mutex_.Unlock();
|
|
|
|
if (creating_new_log) {
|
|
|
|
// TODO: Write buffer size passed in should be max of all CF's instead
|
|
|
|
// of mutable_cf_options.write_buffer_size.
|
|
|
|
io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
|
|
|
|
&new_log);
|
|
|
|
if (s.ok()) {
|
|
|
|
s = io_s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
SequenceNumber seq = versions_->LastSequence();
|
|
|
|
new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
|
|
|
|
context->superversion_context.NewSuperVersion();
|
|
|
|
}
|
|
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
|
|
"[%s] New memtable created with log file: #%" PRIu64
|
|
|
|
". Immutable memtables: %d.\n",
|
|
|
|
cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
|
|
|
|
// There should be no concurrent write as the thread is at the front of
|
|
|
|
// writer queue
|
|
|
|
cfd->mem()->ConstructFragmentedRangeTombstones();
|
|
|
|
|
|
|
|
mutex_.Lock();
|
|
|
|
if (recycle_log_number != 0) {
|
|
|
|
// Since renaming the file is done outside DB mutex, we need to ensure
|
|
|
|
// concurrent full purges don't delete the file while we're recycling it.
|
|
|
|
// To achieve that we hold the old log number in the recyclable list until
|
|
|
|
// after it has been renamed.
|
|
|
|
assert(log_recycle_files_.front() == recycle_log_number);
|
|
|
|
log_recycle_files_.pop_front();
|
|
|
|
}
|
|
|
|
if (s.ok() && creating_new_log) {
|
|
|
|
InstrumentedMutexLock l(&log_write_mutex_);
|
|
|
|
assert(new_log != nullptr);
|
|
|
|
if (!logs_.empty()) {
|
|
|
|
// Alway flush the buffer of the last log before switching to a new one
|
|
|
|
log::Writer* cur_log_writer = logs_.back().writer;
|
|
|
|
if (error_handler_.IsRecoveryInProgress()) {
|
|
|
|
// In recovery path, we force another try of writing WAL buffer.
|
|
|
|
cur_log_writer->file()->reset_seen_error();
|
|
|
|
}
|
|
|
|
io_s = cur_log_writer->WriteBuffer();
|
|
|
|
if (s.ok()) {
|
|
|
|
s = io_s;
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
|
|
"[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
|
|
|
|
" WAL file\n",
|
|
|
|
cfd->GetName().c_str(), cur_log_writer->get_log_number(),
|
|
|
|
new_log_number);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
logfile_number_ = new_log_number;
|
|
|
|
log_empty_ = true;
|
|
|
|
log_dir_synced_ = false;
|
|
|
|
logs_.emplace_back(logfile_number_, new_log);
|
|
|
|
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
// how do we fail if we're not creating new log?
|
|
|
|
assert(creating_new_log);
|
|
|
|
delete new_mem;
|
|
|
|
delete new_log;
|
|
|
|
context->superversion_context.new_superversion.reset();
|
|
|
|
// We may have lost data from the WritableFileBuffer in-memory buffer for
|
|
|
|
// the current log, so treat it as a fatal error and set bg_error
|
|
|
|
if (!io_s.ok()) {
|
|
|
|
error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
|
|
|
|
} else {
|
|
|
|
error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
|
|
|
|
}
|
|
|
|
// Read back bg_error in order to get the right severity
|
|
|
|
s = error_handler_.GetBGError();
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Track WAL obsoletion when updating empty CF's log number (#7781)
Summary:
In the write path, there is an optimization: when a new WAL is created during SwitchMemtable, we update the internal log number of the empty column families to the new WAL. `FindObsoleteFiles` marks a WAL as obsolete if the WAL's log number is less than `VersionSet::MinLogNumberWithUnflushedData`. After updating the empty column families' internal log number, `VersionSet::MinLogNumberWithUnflushedData` might change, so some WALs might become obsolete to be purged from disk.
For example, consider there are 3 column families: 0, 1, 2:
1. initially, all the column families' log number is 1;
2. write some data to cf0, and flush cf0, but the flush is pending;
3. now a new WAL 2 is created;
4. write data to cf1 and WAL 2, now cf0's log number is 1, cf1's log number is 2, cf2's log number is 2 (because cf1 and cf2 are empty, so their log numbers will be set to the highest log number);
5. now cf0's flush hasn't finished, flush cf1, a new WAL 3 is created, and cf1's flush finishes, now cf0's log number is 1, cf1's log number is 3, cf2's log number is 3, since WAL 1 still contains data for the unflushed cf0, no WAL can be deleted from disk;
6. now cf0's flush finishes, cf0's log number is 2 (because when cf0 was switching memtable, WAL 3 does not exist yet), cf1's log number is 3, cf2's log number is 3, so WAL 1 can be purged from disk now, but WAL 2 still cannot because `MinLogNumberToKeep()` is 2;
7. write data to cf2 and WAL 3, because cf0 is empty, its log number is updated to 3, so now cf0's log number is 3, cf1's log number is 3, cf2's log number is 3;
8. now if the background threads want to purge obsolete files from disk, WAL 2 can be purged because `MinLogNumberToKeep()` is 3. But there are only two flush results written to MANIFEST: the first is for flushing cf1, and the `MinLogNumberToKeep` is 1, the second is for flushing cf0, and the `MinLogNumberToKeep` is 2. So without this PR, if the DB crashes at this point and try to recover, `WalSet` will still expect WAL 2 to exist.
When WAL tracking is enabled, we assume WALs will only become obsolete after a flush result is written to MANIFEST in `MemtableList::TryInstallMemtableFlushResults` (or its atomic flush counterpart). The above situation breaks this assumption.
This PR tracks WAL obsoletion if necessary before updating the empty column families' log numbers.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7781
Test Plan:
watch existing tests and stress tests to pass.
`make -j48 blackbox_crash_test` on devserver
Reviewed By: ltamasi
Differential Revision: D25631695
Pulled By: cheng-chang
fbshipit-source-id: ca7fff967bdb42204b84226063d909893bc0a4ec
4 years ago
|
|
|
bool empty_cf_updated = false;
|
|
|
|
if (immutable_db_options_.track_and_verify_wals_in_manifest &&
|
|
|
|
!immutable_db_options_.allow_2pc && creating_new_log) {
|
|
|
|
// In non-2pc mode, WALs become obsolete if they do not contain unflushed
|
|
|
|
// data. Updating the empty CF's log number might cause some WALs to become
|
|
|
|
// obsolete. So we should track the WAL obsoletion event before actually
|
|
|
|
// updating the empty CF's log number.
|
|
|
|
uint64_t min_wal_number_to_keep =
|
|
|
|
versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
|
|
|
|
if (min_wal_number_to_keep >
|
|
|
|
versions_->GetWalSet().GetMinWalNumberToKeep()) {
|
|
|
|
// Get a snapshot of the empty column families.
|
|
|
|
// LogAndApply may release and reacquire db
|
|
|
|
// mutex, during that period, column family may become empty (e.g. its
|
|
|
|
// flush succeeds), then it affects the computed min_log_number_to_keep,
|
|
|
|
// so we take a snapshot for consistency of column family data
|
|
|
|
// status. If a column family becomes non-empty afterwards, its active log
|
|
|
|
// should still be the created new log, so the min_log_number_to_keep is
|
|
|
|
// not affected.
|
|
|
|
autovector<ColumnFamilyData*> empty_cfs;
|
|
|
|
for (auto cf : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cf->IsEmpty()) {
|
|
|
|
empty_cfs.push_back(cf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
VersionEdit wal_deletion;
|
|
|
|
wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
|
Sync dir containing CURRENT after RenameFile on CURRENT as much as possible (#10573)
Summary:
**Context:**
Below crash test revealed a bug that directory containing CURRENT file (short for `dir_contains_current_file` below) was not always get synced after a new CURRENT is created and being called with `RenameFile` as part of the creation.
This bug exposes a risk that such un-synced directory containing the updated CURRENT can’t survive a host crash (e.g, power loss) hence get corrupted. This then will be followed by a recovery from a corrupted CURRENT that we don't want.
The root-cause is that a nullptr `FSDirectory* dir_contains_current_file` sometimes gets passed-down to `SetCurrentFile()` hence in those case `dir_contains_current_file->FSDirectory::FsyncWithDirOptions()` will be skipped (which otherwise will internally call`Env/FS::SyncDic()` )
```
./db_stress --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --allow_data_in_errors=True --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=8 --block_size=16384 --bloom_bits=134.8015470676662 --bottommost_compression_type=disable --cache_size=8388608 --checkpoint_one_in=1000000 --checksum_type=kCRC32c --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=2 --compaction_ttl=100 --compression_max_dict_buffer_bytes=511 --compression_max_dict_bytes=16384 --compression_type=zstd --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=65536 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --disable_wal=0 --enable_compaction_filter=0 --enable_pipelined_write=1 --expected_values_dir=$exp --fail_if_options_file_error=1 --file_checksum_impl=none --flush_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --ingest_external_file_one_in=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --mark_for_compaction_one_file_in=10 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=16384 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=1 --memtable_whole_key_filtering=1 --mmap_read=1 --nooverwritepercent=1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_pinning=2 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 --prefix_size=5 --prefixpercent=5 --prepopulate_block_cache=1 --progress_reports=0 --read_fault_one_in=1000 --readpercent=45 --recycle_log_file_num=0 --reopen=0 --ribbon_starting_level=999 --secondary_cache_fault_one_in=32 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --subcompactions=3 --sync_fault_injection=1 --target_file_size_base=2097 --target_file_size_multiplier=2 --test_batches_snapshots=1 --top_level_index_pinning=1 --use_full_merge_v1=1 --use_merge=1 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --write_buffer_size=4194 --writepercent=35
```
```
stderr:
WARNING: prefix_size is non-zero but memtablerep != prefix_hash
db_stress: utilities/fault_injection_fs.cc:748: virtual rocksdb::IOStatus rocksdb::FaultInjectionTestFS::RenameFile(const std::string &, const std::string &, const rocksdb::IOOptions &, rocksdb::IODebugContext *): Assertion `tlist.find(tdn.second) == tlist.end()' failed.`
```
**Summary:**
The PR ensured the non-test path pass down a non-null dir containing CURRENT (which is by current RocksDB assumption just db_dir) by doing the following:
- Renamed `directory_to_fsync` as `dir_contains_current_file` in `SetCurrentFile()` to tighten the association between this directory and CURRENT file
- Changed `SetCurrentFile()` API to require `dir_contains_current_file` being passed-in, instead of making it by default nullptr.
- Because `SetCurrentFile()`'s `dir_contains_current_file` is passed down from `VersionSet::LogAndApply()` then `VersionSet::ProcessManifestWrites()` (i.e, think about this as a chain of 3 functions related to MANIFEST update), these 2 functions also got refactored to require `dir_contains_current_file`
- Updated the non-test-path callers of these 3 functions to obtain and pass in non-nullptr `dir_contains_current_file`, which by current assumption of RocksDB, is the `FSDirectory* db_dir`.
- `db_impl` path will obtain `DBImpl::directories_.getDbDir()` while others with no access to such `directories_` are obtained on the fly by creating such object `FileSystem::NewDirectory(..)` and manage it by unique pointers to ensure short life time.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10573
Test Plan:
- `make check`
- Passed the repro db_stress command
- For future improvement, since we currently don't assert dir containing CURRENT to be non-nullptr due to https://github.com/facebook/rocksdb/pull/10573#pullrequestreview-1087698899, there is still chances that future developers mistakenly pass down nullptr dir containing CURRENT thus resulting skipped sync dir and cause the bug again. Therefore a smarter test (e.g, such as quoted from ajkr "(make) unsynced data loss to be dropping files corresponding to unsynced directory entries") is still needed.
Reviewed By: ajkr
Differential Revision: D39005886
Pulled By: hx235
fbshipit-source-id: 336fb9090d0cfa6ca3dd580db86268007dde7f5a
2 years ago
|
|
|
s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_,
|
|
|
|
directories_.GetDbDir());
|
Track WAL obsoletion when updating empty CF's log number (#7781)
Summary:
In the write path, there is an optimization: when a new WAL is created during SwitchMemtable, we update the internal log number of the empty column families to the new WAL. `FindObsoleteFiles` marks a WAL as obsolete if the WAL's log number is less than `VersionSet::MinLogNumberWithUnflushedData`. After updating the empty column families' internal log number, `VersionSet::MinLogNumberWithUnflushedData` might change, so some WALs might become obsolete to be purged from disk.
For example, consider there are 3 column families: 0, 1, 2:
1. initially, all the column families' log number is 1;
2. write some data to cf0, and flush cf0, but the flush is pending;
3. now a new WAL 2 is created;
4. write data to cf1 and WAL 2, now cf0's log number is 1, cf1's log number is 2, cf2's log number is 2 (because cf1 and cf2 are empty, so their log numbers will be set to the highest log number);
5. now cf0's flush hasn't finished, flush cf1, a new WAL 3 is created, and cf1's flush finishes, now cf0's log number is 1, cf1's log number is 3, cf2's log number is 3, since WAL 1 still contains data for the unflushed cf0, no WAL can be deleted from disk;
6. now cf0's flush finishes, cf0's log number is 2 (because when cf0 was switching memtable, WAL 3 does not exist yet), cf1's log number is 3, cf2's log number is 3, so WAL 1 can be purged from disk now, but WAL 2 still cannot because `MinLogNumberToKeep()` is 2;
7. write data to cf2 and WAL 3, because cf0 is empty, its log number is updated to 3, so now cf0's log number is 3, cf1's log number is 3, cf2's log number is 3;
8. now if the background threads want to purge obsolete files from disk, WAL 2 can be purged because `MinLogNumberToKeep()` is 3. But there are only two flush results written to MANIFEST: the first is for flushing cf1, and the `MinLogNumberToKeep` is 1, the second is for flushing cf0, and the `MinLogNumberToKeep` is 2. So without this PR, if the DB crashes at this point and try to recover, `WalSet` will still expect WAL 2 to exist.
When WAL tracking is enabled, we assume WALs will only become obsolete after a flush result is written to MANIFEST in `MemtableList::TryInstallMemtableFlushResults` (or its atomic flush counterpart). The above situation breaks this assumption.
This PR tracks WAL obsoletion if necessary before updating the empty column families' log numbers.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7781
Test Plan:
watch existing tests and stress tests to pass.
`make -j48 blackbox_crash_test` on devserver
Reviewed By: ltamasi
Differential Revision: D25631695
Pulled By: cheng-chang
fbshipit-source-id: ca7fff967bdb42204b84226063d909893bc0a4ec
4 years ago
|
|
|
if (!s.ok() && versions_->io_status().IsIOError()) {
|
|
|
|
s = error_handler_.SetBGError(versions_->io_status(),
|
|
|
|
BackgroundErrorReason::kManifestWrite);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto cf : empty_cfs) {
|
|
|
|
if (cf->IsEmpty()) {
|
|
|
|
cf->SetLogNumber(logfile_number_);
|
Memtable "MemPurge" prototype (#8454)
Summary:
Implement an experimental feature called "MemPurge", which consists in purging "garbage" bytes out of a memtable and reuse the memtable struct instead of making it immutable and eventually flushing its content to storage.
The prototype is by default deactivated and is not intended for use. It is intended for correctness and validation testing. At the moment, the "MemPurge" feature can be switched on by using the `options.experimental_allow_mempurge` flag. For this early stage, when the allow_mempurge flag is set to `true`, all the flush operations will be rerouted to perform a MemPurge. This is a temporary design decision that will give us the time to explore meaningful heuristics to use MemPurge at the right time for relevant workloads . Moreover, the current MemPurge operation only supports `Puts`, `Deletes`, `DeleteRange` operations, and handles `Iterators` as well as `CompactionFilter`s that are invoked at flush time .
Three unit tests are added to `db_flush_test.cc` to test if MemPurge works correctly (and checks that the previously mentioned operations are fully supported thoroughly tested).
One noticeable design decision is the timing of the MemPurge operation in the memtable workflow: for this prototype, the mempurge happens when the memtable is switched (and usually made immutable). This is an inefficient process because it implies that the entirety of the MemPurge operation happens while holding the db_mutex. Future commits will make the MemPurge operation a background task (akin to the regular flush operation) and aim at drastically enhancing the performance of this operation. The MemPurge is also not fully "WAL-compatible" yet, but when the WAL is full, or when the regular MemPurge operation fails (or when the purged memtable still needs to be flushed), a regular flush operation takes place. Later commits will also correct these behaviors.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8454
Reviewed By: anand1976
Differential Revision: D29433971
Pulled By: bjlemaire
fbshipit-source-id: 6af48213554e35048a7e03816955100a80a26dc5
4 years ago
|
|
|
// MEMPURGE: No need to change this, because new adds
|
|
|
|
// should still receive new sequence numbers.
|
Track WAL obsoletion when updating empty CF's log number (#7781)
Summary:
In the write path, there is an optimization: when a new WAL is created during SwitchMemtable, we update the internal log number of the empty column families to the new WAL. `FindObsoleteFiles` marks a WAL as obsolete if the WAL's log number is less than `VersionSet::MinLogNumberWithUnflushedData`. After updating the empty column families' internal log number, `VersionSet::MinLogNumberWithUnflushedData` might change, so some WALs might become obsolete to be purged from disk.
For example, consider there are 3 column families: 0, 1, 2:
1. initially, all the column families' log number is 1;
2. write some data to cf0, and flush cf0, but the flush is pending;
3. now a new WAL 2 is created;
4. write data to cf1 and WAL 2, now cf0's log number is 1, cf1's log number is 2, cf2's log number is 2 (because cf1 and cf2 are empty, so their log numbers will be set to the highest log number);
5. now cf0's flush hasn't finished, flush cf1, a new WAL 3 is created, and cf1's flush finishes, now cf0's log number is 1, cf1's log number is 3, cf2's log number is 3, since WAL 1 still contains data for the unflushed cf0, no WAL can be deleted from disk;
6. now cf0's flush finishes, cf0's log number is 2 (because when cf0 was switching memtable, WAL 3 does not exist yet), cf1's log number is 3, cf2's log number is 3, so WAL 1 can be purged from disk now, but WAL 2 still cannot because `MinLogNumberToKeep()` is 2;
7. write data to cf2 and WAL 3, because cf0 is empty, its log number is updated to 3, so now cf0's log number is 3, cf1's log number is 3, cf2's log number is 3;
8. now if the background threads want to purge obsolete files from disk, WAL 2 can be purged because `MinLogNumberToKeep()` is 3. But there are only two flush results written to MANIFEST: the first is for flushing cf1, and the `MinLogNumberToKeep` is 1, the second is for flushing cf0, and the `MinLogNumberToKeep` is 2. So without this PR, if the DB crashes at this point and try to recover, `WalSet` will still expect WAL 2 to exist.
When WAL tracking is enabled, we assume WALs will only become obsolete after a flush result is written to MANIFEST in `MemtableList::TryInstallMemtableFlushResults` (or its atomic flush counterpart). The above situation breaks this assumption.
This PR tracks WAL obsoletion if necessary before updating the empty column families' log numbers.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7781
Test Plan:
watch existing tests and stress tests to pass.
`make -j48 blackbox_crash_test` on devserver
Reviewed By: ltamasi
Differential Revision: D25631695
Pulled By: cheng-chang
fbshipit-source-id: ca7fff967bdb42204b84226063d909893bc0a4ec
4 years ago
|
|
|
cf->mem()->SetCreationSeq(versions_->LastSequence());
|
|
|
|
} // cf may become non-empty.
|
|
|
|
}
|
|
|
|
empty_cf_updated = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!empty_cf_updated) {
|
|
|
|
for (auto cf : *versions_->GetColumnFamilySet()) {
|
|
|
|
// all this is just optimization to delete logs that
|
|
|
|
// are no longer needed -- if CF is empty, that means it
|
|
|
|
// doesn't need that particular log to stay alive, so we just
|
|
|
|
// advance the log number. no need to persist this in the manifest
|
|
|
|
if (cf->IsEmpty()) {
|
|
|
|
if (creating_new_log) {
|
|
|
|
cf->SetLogNumber(logfile_number_);
|
|
|
|
}
|
|
|
|
cf->mem()->SetCreationSeq(versions_->LastSequence());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cfd->mem()->SetNextLogNumber(logfile_number_);
|
|
|
|
assert(new_mem != nullptr);
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failling, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process ressembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
4 years ago
|
|
|
cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
|
|
|
|
new_mem->Ref();
|
|
|
|
cfd->SetMemtable(new_mem);
|
|
|
|
InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failling, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process ressembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
4 years ago
|
|
|
mutable_cf_options);
|
Memtable "MemPurge" prototype (#8454)
Summary:
Implement an experimental feature called "MemPurge", which consists in purging "garbage" bytes out of a memtable and reuse the memtable struct instead of making it immutable and eventually flushing its content to storage.
The prototype is by default deactivated and is not intended for use. It is intended for correctness and validation testing. At the moment, the "MemPurge" feature can be switched on by using the `options.experimental_allow_mempurge` flag. For this early stage, when the allow_mempurge flag is set to `true`, all the flush operations will be rerouted to perform a MemPurge. This is a temporary design decision that will give us the time to explore meaningful heuristics to use MemPurge at the right time for relevant workloads . Moreover, the current MemPurge operation only supports `Puts`, `Deletes`, `DeleteRange` operations, and handles `Iterators` as well as `CompactionFilter`s that are invoked at flush time .
Three unit tests are added to `db_flush_test.cc` to test if MemPurge works correctly (and checks that the previously mentioned operations are fully supported thoroughly tested).
One noticeable design decision is the timing of the MemPurge operation in the memtable workflow: for this prototype, the mempurge happens when the memtable is switched (and usually made immutable). This is an inefficient process because it implies that the entirety of the MemPurge operation happens while holding the db_mutex. Future commits will make the MemPurge operation a background task (akin to the regular flush operation) and aim at drastically enhancing the performance of this operation. The MemPurge is also not fully "WAL-compatible" yet, but when the WAL is full, or when the regular MemPurge operation fails (or when the purged memtable still needs to be flushed), a regular flush operation takes place. Later commits will also correct these behaviors.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8454
Reviewed By: anand1976
Differential Revision: D29433971
Pulled By: bjlemaire
fbshipit-source-id: 6af48213554e35048a7e03816955100a80a26dc5
4 years ago
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Notify client that memtable is sealed, now that we have successfully
|
|
|
|
// installed a new memtable
|
|
|
|
NotifyOnMemTableSealed(cfd, memtable_info);
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
// It is possible that we got here without checking the value of i_os, but
|
|
|
|
// that is okay. If we did, it most likely means that s was already an error.
|
|
|
|
// In any case, ignore any unchecked error for i_os here.
|
|
|
|
io_s.PermitUncheckedError();
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
size_t bsize =
|
|
|
|
static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
|
|
|
|
// Some users might set very high write_buffer_size and rely on
|
|
|
|
// max_total_wal_size or other parameters to control the WAL size.
|
|
|
|
if (mutable_db_options_.max_total_wal_size > 0) {
|
|
|
|
bsize = std::min<size_t>(
|
|
|
|
bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
|
|
|
|
}
|
|
|
|
if (immutable_db_options_.db_write_buffer_size > 0) {
|
|
|
|
bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
|
|
|
|
}
|
|
|
|
if (immutable_db_options_.write_buffer_manager &&
|
|
|
|
immutable_db_options_.write_buffer_manager->enabled()) {
|
|
|
|
bsize = std::min<size_t>(
|
|
|
|
bsize, immutable_db_options_.write_buffer_manager->buffer_size());
|
|
|
|
}
|
|
|
|
|
|
|
|
return bsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Default implementations of convenience methods that subclasses of DB
|
|
|
|
// can call if they wish
|
|
|
|
Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
// Pre-allocate size of write batch conservatively.
|
|
|
|
// 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
|
|
|
|
// and we allocate 11 extra bytes for key length, as well as value length.
|
|
|
|
WriteBatch batch(key.size() + value.size() + 24, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status s = batch.Put(column_family, key, value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& ts, const Slice& value) {
|
|
|
|
ColumnFamilyHandle* default_cf = DefaultColumnFamily();
|
|
|
|
assert(default_cf);
|
|
|
|
const Comparator* const default_cf_ucmp = default_cf->GetComparator();
|
|
|
|
assert(default_cf_ucmp);
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key,
|
|
|
|
default_cf_ucmp->timestamp_size());
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status s = batch.Put(column_family, key, ts, value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::PutEntity(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const WideColumns& columns) {
|
|
|
|
const ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
|
|
|
|
assert(default_cf);
|
|
|
|
|
|
|
|
const Comparator* const default_cf_ucmp = default_cf->GetComparator();
|
|
|
|
assert(default_cf_ucmp);
|
|
|
|
|
|
|
|
WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0,
|
|
|
|
options.protection_bytes_per_key,
|
|
|
|
default_cf_ucmp->timestamp_size());
|
|
|
|
|
|
|
|
const Status s = batch.PutEntity(column_family, key, columns);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Write(options, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status s = batch.Delete(column_family, key);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& ts) {
|
|
|
|
ColumnFamilyHandle* default_cf = DefaultColumnFamily();
|
|
|
|
assert(default_cf);
|
|
|
|
const Comparator* const default_cf_ucmp = default_cf->GetComparator();
|
|
|
|
assert(default_cf_ucmp);
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key,
|
|
|
|
default_cf_ucmp->timestamp_size());
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status s = batch.Delete(column_family, key, ts);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::SingleDelete(const WriteOptions& opt,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status s = batch.SingleDelete(column_family, key);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status DB::SingleDelete(const WriteOptions& opt,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& ts) {
|
|
|
|
ColumnFamilyHandle* default_cf = DefaultColumnFamily();
|
|
|
|
assert(default_cf);
|
|
|
|
const Comparator* const default_cf_ucmp = default_cf->GetComparator();
|
|
|
|
assert(default_cf_ucmp);
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key,
|
|
|
|
default_cf_ucmp->timestamp_size());
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
Status s = batch.SingleDelete(column_family, key, ts);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::DeleteRange(const WriteOptions& opt,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& begin_key, const Slice& end_key) {
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
|
|
|
|
Status s = batch.DeleteRange(column_family, begin_key, end_key);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::DeleteRange(const WriteOptions& opt,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& begin_key, const Slice& end_key,
|
|
|
|
const Slice& ts) {
|
|
|
|
ColumnFamilyHandle* default_cf = DefaultColumnFamily();
|
|
|
|
assert(default_cf);
|
|
|
|
const Comparator* const default_cf_ucmp = default_cf->GetComparator();
|
|
|
|
assert(default_cf_ucmp);
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key,
|
|
|
|
default_cf_ucmp->timestamp_size());
|
|
|
|
Status s = batch.DeleteRange(column_family, begin_key, end_key, ts);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) {
|
|
|
|
WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
|
|
|
|
opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
|
|
|
|
Status s = batch.Merge(column_family, key, value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|