You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rocksdb/db/c.cc

6227 lines
205 KiB

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE
#include "rocksdb/c.h"
#include <cstdlib>
#include <map>
#include <unordered_set>
#include <vector>
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/comparator.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "rocksdb/universal_compaction.h"
#include "rocksdb/utilities/backup_engine.h"
#include "rocksdb/utilities/checkpoint.h"
#include "rocksdb/utilities/db_ttl.h"
#include "rocksdb/utilities/memory_util.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/table_properties_collectors.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "rocksdb/write_batch.h"
#include "utilities/merge_operators.h"
using ROCKSDB_NAMESPACE::BackupEngine;
using ROCKSDB_NAMESPACE::BackupEngineOptions;
using ROCKSDB_NAMESPACE::BackupID;
using ROCKSDB_NAMESPACE::BackupInfo;
using ROCKSDB_NAMESPACE::BatchResult;
using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
using ROCKSDB_NAMESPACE::BottommostLevelCompaction;
using ROCKSDB_NAMESPACE::BytewiseComparator;
using ROCKSDB_NAMESPACE::Cache;
using ROCKSDB_NAMESPACE::Checkpoint;
using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
using ROCKSDB_NAMESPACE::CompactionFilter;
using ROCKSDB_NAMESPACE::CompactionFilterFactory;
using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
using ROCKSDB_NAMESPACE::CompactRangeOptions;
using ROCKSDB_NAMESPACE::Comparator;
using ROCKSDB_NAMESPACE::CompressionType;
using ROCKSDB_NAMESPACE::CuckooTableOptions;
using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::DBOptions;
using ROCKSDB_NAMESPACE::DbPath;
using ROCKSDB_NAMESPACE::Env;
using ROCKSDB_NAMESPACE::EnvOptions;
using ROCKSDB_NAMESPACE::FileLock;
using ROCKSDB_NAMESPACE::FilterPolicy;
using ROCKSDB_NAMESPACE::FlushOptions;
using ROCKSDB_NAMESPACE::InfoLogLevel;
using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
using ROCKSDB_NAMESPACE::Iterator;
using ROCKSDB_NAMESPACE::LiveFileMetaData;
using ROCKSDB_NAMESPACE::Logger;
using ROCKSDB_NAMESPACE::LRUCacheOptions;
using ROCKSDB_NAMESPACE::MemoryAllocator;
using ROCKSDB_NAMESPACE::MemoryUtil;
using ROCKSDB_NAMESPACE::MergeOperator;
using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
using ROCKSDB_NAMESPACE::NewLRUCache;
using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy;
using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
using ROCKSDB_NAMESPACE::Options;
using ROCKSDB_NAMESPACE::PerfContext;
using ROCKSDB_NAMESPACE::PerfLevel;
using ROCKSDB_NAMESPACE::PinnableSlice;
using ROCKSDB_NAMESPACE::RandomAccessFile;
using ROCKSDB_NAMESPACE::Range;
using ROCKSDB_NAMESPACE::RateLimiter;
using ROCKSDB_NAMESPACE::ReadOptions;
using ROCKSDB_NAMESPACE::RestoreOptions;
using ROCKSDB_NAMESPACE::SequentialFile;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::SliceParts;
using ROCKSDB_NAMESPACE::SliceTransform;
using ROCKSDB_NAMESPACE::Snapshot;
using ROCKSDB_NAMESPACE::SstFileWriter;
using ROCKSDB_NAMESPACE::Status;
using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
using ROCKSDB_NAMESPACE::Transaction;
using ROCKSDB_NAMESPACE::TransactionDB;
using ROCKSDB_NAMESPACE::TransactionDBOptions;
using ROCKSDB_NAMESPACE::TransactionLogIterator;
using ROCKSDB_NAMESPACE::TransactionOptions;
using ROCKSDB_NAMESPACE::WALRecoveryMode;
using ROCKSDB_NAMESPACE::WritableFile;
using ROCKSDB_NAMESPACE::WriteBatch;
using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
using ROCKSDB_NAMESPACE::WriteOptions;
using std::vector;
using std::unordered_set;
extern "C" {
struct rocksdb_t { DB* rep; };
struct rocksdb_backup_engine_t { BackupEngine* rep; };
struct rocksdb_backup_engine_info_t { std::vector<BackupInfo> rep; };
struct rocksdb_restore_options_t { RestoreOptions rep; };
struct rocksdb_iterator_t { Iterator* rep; };
struct rocksdb_writebatch_t { WriteBatch rep; };
struct rocksdb_writebatch_wi_t { WriteBatchWithIndex* rep; };
struct rocksdb_snapshot_t { const Snapshot* rep; };
struct rocksdb_flushoptions_t { FlushOptions rep; };
struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
struct rocksdb_readoptions_t {
ReadOptions rep;
// stack variables to set pointers to in ReadOptions
Slice upper_bound;
Slice lower_bound;
Slice timestamp;
Slice iter_start_ts;
};
struct rocksdb_writeoptions_t {
WriteOptions rep;
};
struct rocksdb_options_t {
Options rep;
};
struct rocksdb_compactoptions_t {
CompactRangeOptions rep;
Slice full_history_ts_low;
};
struct rocksdb_block_based_table_options_t {
BlockBasedTableOptions rep;
};
struct rocksdb_cuckoo_table_options_t {
CuckooTableOptions rep;
};
struct rocksdb_seqfile_t { SequentialFile* rep; };
struct rocksdb_randomfile_t { RandomAccessFile* rep; };
struct rocksdb_writablefile_t { WritableFile* rep; };
struct rocksdb_wal_iterator_t { TransactionLogIterator* rep; };
struct rocksdb_wal_readoptions_t { TransactionLogIterator::ReadOptions rep; };
struct rocksdb_filelock_t { FileLock* rep; };
struct rocksdb_logger_t {
std::shared_ptr<Logger> rep;
};
struct rocksdb_lru_cache_options_t {
LRUCacheOptions rep;
};
struct rocksdb_memory_allocator_t {
std::shared_ptr<MemoryAllocator> rep;
};
struct rocksdb_cache_t {
std::shared_ptr<Cache> rep;
};
struct rocksdb_livefiles_t { std::vector<LiveFileMetaData> rep; };
struct rocksdb_column_family_handle_t { ColumnFamilyHandle* rep; };
struct rocksdb_envoptions_t { EnvOptions rep; };
struct rocksdb_ingestexternalfileoptions_t { IngestExternalFileOptions rep; };
struct rocksdb_sstfilewriter_t { SstFileWriter* rep; };
struct rocksdb_ratelimiter_t {
std::shared_ptr<RateLimiter> rep;
};
struct rocksdb_perfcontext_t { PerfContext* rep; };
struct rocksdb_pinnableslice_t {
PinnableSlice rep;
};
struct rocksdb_transactiondb_options_t {
TransactionDBOptions rep;
};
struct rocksdb_transactiondb_t {
TransactionDB* rep;
};
struct rocksdb_transaction_options_t {
TransactionOptions rep;
};
struct rocksdb_transaction_t {
Transaction* rep;
};
struct rocksdb_backup_engine_options_t {
BackupEngineOptions rep;
};
struct rocksdb_checkpoint_t {
Checkpoint* rep;
};
struct rocksdb_optimistictransactiondb_t {
OptimisticTransactionDB* rep;
};
struct rocksdb_optimistictransaction_options_t {
OptimisticTransactionOptions rep;
};
struct rocksdb_compactionfiltercontext_t {
CompactionFilter::Context rep;
};
struct rocksdb_compactionfilter_t : public CompactionFilter {
void* state_;
void (*destructor_)(void*);
unsigned char (*filter_)(
void*,
int level,
const char* key, size_t key_length,
const char* existing_value, size_t value_length,
char** new_value, size_t *new_value_length,
unsigned char* value_changed);
const char* (*name_)(void*);
unsigned char ignore_snapshots_;
~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
bool Filter(int level, const Slice& key, const Slice& existing_value,
std::string* new_value, bool* value_changed) const override {
Implement full filter for block based table. Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979
10 years ago
char* c_new_value = nullptr;
size_t new_value_length = 0;
unsigned char c_value_changed = 0;
unsigned char result = (*filter_)(
state_,
level,
key.data(), key.size(),
existing_value.data(), existing_value.size(),
&c_new_value, &new_value_length, &c_value_changed);
if (c_value_changed) {
new_value->assign(c_new_value, new_value_length);
*value_changed = true;
}
return result;
}
const char* Name() const override { return (*name_)(state_); }
bool IgnoreSnapshots() const override { return ignore_snapshots_; }
};
struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
void* state_;
void (*destructor_)(void*);
rocksdb_compactionfilter_t* (*create_compaction_filter_)(
void*, rocksdb_compactionfiltercontext_t* context);
const char* (*name_)(void*);
~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& context) override {
rocksdb_compactionfiltercontext_t ccontext;
ccontext.rep = context;
CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
return std::unique_ptr<CompactionFilter>(cf);
}
const char* Name() const override { return (*name_)(state_); }
};
struct rocksdb_comparator_t : public Comparator {
void* state_;
void (*destructor_)(void*);
int (*compare_)(void*, const char* a, size_t alen, const char* b,
size_t blen);
const char* (*name_)(void*);
int (*compare_ts_)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
size_t b_tslen);
int (*compare_without_ts_)(void*, const char* a, size_t alen,
unsigned char a_has_ts, const char* b, size_t blen,
unsigned char b_has_ts);
rocksdb_comparator_t() : Comparator() {}
rocksdb_comparator_t(size_t ts_size) : Comparator(ts_size) {}
~rocksdb_comparator_t() override { (*destructor_)(state_); }
int Compare(const Slice& a, const Slice& b) const override {
return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
}
int CompareTimestamp(const Slice& a_ts, const Slice& b_ts) const override {
if (compare_ts_ == nullptr) {
return 0;
}
return (*compare_ts_)(state_, a_ts.data(), a_ts.size(), b_ts.data(),
b_ts.size());
}
int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
bool b_has_ts) const override {
if (compare_without_ts_ == nullptr) {
return Compare(a, b);
}
return (*compare_without_ts_)(state_, a.data(), a.size(), a_has_ts,
b.data(), b.size(), b_has_ts);
}
const char* Name() const override { return (*name_)(state_); }
// No-ops since the C binding does not support key shortening methods.
void FindShortestSeparator(std::string*, const Slice&) const override {}
void FindShortSuccessor(std::string* /*key*/) const override {}
};
struct rocksdb_filterpolicy_t : public FilterPolicy {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
const char* Name() const override { return (*name_)(state_); }
};
struct rocksdb_mergeoperator_t : public MergeOperator {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
char* (*full_merge_)(
void*,
const char* key, size_t key_length,
const char* existing_value, size_t existing_value_length,
const char* const* operands_list, const size_t* operands_list_length,
int num_operands,
unsigned char* success, size_t* new_value_length);
char* (*partial_merge_)(void*, const char* key, size_t key_length,
const char* const* operands_list,
const size_t* operands_list_length, int num_operands,
unsigned char* success, size_t* new_value_length);
void (*delete_value_)(
void*,
const char* value, size_t value_length);
~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
const char* Name() const override { return (*name_)(state_); }
bool FullMergeV2(const MergeOperationInput& merge_in,
MergeOperationOutput* merge_out) const override {
Introduce FullMergeV2 (eliminate memcpy from merge operators) Summary: This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice> This diff is stacked on top of D56493 and D56511 In this diff we - Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future - Replace std::deque<std::string> with std::vector<Slice> to pass operands - Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187) - Allow FullMergeV2 output to be an existing operand ``` [Everything in Memtable | 10K operands | 10 KB each | 1 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s [master] readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s ``` ``` [Everything in Memtable | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s [master] readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 1 operand per key] [FullMergeV2] $ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s [master] readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions [FullMergeV2] readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s [master] readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s ``` Test Plan: COMPILE_WITH_ASAN=1 make check -j64 Reviewers: yhchiang, andrewkr, sdong Reviewed By: sdong Subscribers: lovro, andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D57075
9 years ago
size_t n = merge_in.operand_list.size();
std::vector<const char*> operand_pointers(n);
std::vector<size_t> operand_sizes(n);
for (size_t i = 0; i < n; i++) {
Introduce FullMergeV2 (eliminate memcpy from merge operators) Summary: This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice> This diff is stacked on top of D56493 and D56511 In this diff we - Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future - Replace std::deque<std::string> with std::vector<Slice> to pass operands - Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187) - Allow FullMergeV2 output to be an existing operand ``` [Everything in Memtable | 10K operands | 10 KB each | 1 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s [master] readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s ``` ``` [Everything in Memtable | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s [master] readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 1 operand per key] [FullMergeV2] $ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s [master] readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions [FullMergeV2] readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s [master] readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s ``` Test Plan: COMPILE_WITH_ASAN=1 make check -j64 Reviewers: yhchiang, andrewkr, sdong Reviewed By: sdong Subscribers: lovro, andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D57075
9 years ago
Slice operand(merge_in.operand_list[i]);
operand_pointers[i] = operand.data();
operand_sizes[i] = operand.size();
}
const char* existing_value_data = nullptr;
size_t existing_value_len = 0;
Introduce FullMergeV2 (eliminate memcpy from merge operators) Summary: This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice> This diff is stacked on top of D56493 and D56511 In this diff we - Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future - Replace std::deque<std::string> with std::vector<Slice> to pass operands - Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187) - Allow FullMergeV2 output to be an existing operand ``` [Everything in Memtable | 10K operands | 10 KB each | 1 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s [master] readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s ``` ``` [Everything in Memtable | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s [master] readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 1 operand per key] [FullMergeV2] $ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s [master] readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions [FullMergeV2] readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s [master] readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s ``` Test Plan: COMPILE_WITH_ASAN=1 make check -j64 Reviewers: yhchiang, andrewkr, sdong Reviewed By: sdong Subscribers: lovro, andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D57075
9 years ago
if (merge_in.existing_value != nullptr) {
existing_value_data = merge_in.existing_value->data();
existing_value_len = merge_in.existing_value->size();
}
unsigned char success;
size_t new_value_len;
char* tmp_new_value = (*full_merge_)(
Introduce FullMergeV2 (eliminate memcpy from merge operators) Summary: This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice> This diff is stacked on top of D56493 and D56511 In this diff we - Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future - Replace std::deque<std::string> with std::vector<Slice> to pass operands - Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187) - Allow FullMergeV2 output to be an existing operand ``` [Everything in Memtable | 10K operands | 10 KB each | 1 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s [master] readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s ``` ``` [Everything in Memtable | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000 [FullMergeV2] readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s [master] readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 1 operand per key] [FullMergeV2] $ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s [master] readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s ``` ``` [Everything in Block cache | 10K operands | 10 KB each | 10 operand per key] DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions [FullMergeV2] readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s [master] readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s ``` Test Plan: COMPILE_WITH_ASAN=1 make check -j64 Reviewers: yhchiang, andrewkr, sdong Reviewed By: sdong Subscribers: lovro, andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D57075
9 years ago
state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
existing_value_len, &operand_pointers[0], &operand_sizes[0],
static_cast<int>(n), &success, &new_value_len);
merge_out->new_value.assign(tmp_new_value, new_value_len);
if (delete_value_ != nullptr) {
(*delete_value_)(state_, tmp_new_value, new_value_len);
} else {
free(tmp_new_value);
}
return success;
}
bool PartialMergeMulti(const Slice& key,
const std::deque<Slice>& operand_list,
std::string* new_value,
Logger* /*logger*/) const override {
size_t operand_count = operand_list.size();
std::vector<const char*> operand_pointers(operand_count);
std::vector<size_t> operand_sizes(operand_count);
for (size_t i = 0; i < operand_count; ++i) {
Slice operand(operand_list[i]);
operand_pointers[i] = operand.data();
operand_sizes[i] = operand.size();
}
unsigned char success;
size_t new_value_len;
char* tmp_new_value = (*partial_merge_)(
state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
static_cast<int>(operand_count), &success, &new_value_len);
new_value->assign(tmp_new_value, new_value_len);
if (delete_value_ != nullptr) {
(*delete_value_)(state_, tmp_new_value, new_value_len);
} else {
free(tmp_new_value);
}
return success;
}
};
struct rocksdb_dbpath_t {
DbPath rep;
};
struct rocksdb_env_t {
Env* rep;
bool is_default;
};
struct rocksdb_slicetransform_t : public SliceTransform {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
char* (*transform_)(
void*,
const char* key, size_t length,
size_t* dst_length);
unsigned char (*in_domain_)(
void*,
const char* key, size_t length);
unsigned char (*in_range_)(
void*,
const char* key, size_t length);
~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
const char* Name() const override { return (*name_)(state_); }
Slice Transform(const Slice& src) const override {
size_t len;
char* dst = (*transform_)(state_, src.data(), src.size(), &len);
return Slice(dst, len);
}
bool InDomain(const Slice& src) const override {
return (*in_domain_)(state_, src.data(), src.size());
}
bool InRange(const Slice& src) const override {
return (*in_range_)(state_, src.data(), src.size());
}
};
struct rocksdb_universal_compaction_options_t {
ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep;
};
static bool SaveError(char** errptr, const Status& s) {
assert(errptr != nullptr);
if (s.ok()) {
return false;
} else if (*errptr == nullptr) {
*errptr = strdup(s.ToString().c_str());
} else {
// TODO(sanjay): Merge with existing error?
// This is a bug if *errptr is not created by malloc()
free(*errptr);
*errptr = strdup(s.ToString().c_str());
}
return true;
}
static char* CopyString(const std::string& str) {
char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
memcpy(result, str.data(), sizeof(char) * str.size());
return result;
}
rocksdb_t* rocksdb_open(
const rocksdb_options_t* options,
const char* name,
char** errptr) {
DB* db;
if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
return nullptr;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_t* rocksdb_open_with_ttl(
const rocksdb_options_t* options,
const char* name,
int ttl,
char** errptr) {
ROCKSDB_NAMESPACE::DBWithTTL* db;
if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
options->rep, std::string(name), &db, ttl))) {
return nullptr;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options,
const char* name,
unsigned char error_if_wal_file_exists,
char** errptr) {
DB* db;
if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name),
&db, error_if_wal_file_exists))) {
return nullptr;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
const char* name,
const char* secondary_path,
char** errptr) {
DB* db;
if (SaveError(errptr,
DB::OpenAsSecondary(options->rep, std::string(name),
std::string(secondary_path), &db))) {
return nullptr;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_backup_engine_t* rocksdb_backup_engine_open(
10 years ago
const rocksdb_options_t* options, const char* path, char** errptr) {
BackupEngine* be;
if (SaveError(errptr, BackupEngine::Open(
options->rep.env,
BackupEngineOptions(path, nullptr, true,
options->rep.info_log.get()),
&be))) {
return nullptr;
}
rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
result->rep = be;
return result;
}
rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts(
const rocksdb_backup_engine_options_t* options, rocksdb_env_t* env,
char** errptr) {
BackupEngine* be;
if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) {
return nullptr;
}
rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
result->rep = be;
return result;
}
10 years ago
void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
rocksdb_t* db,
char** errptr) {
SaveError(errptr, be->rep->CreateNewBackup(db->rep));
}
void rocksdb_backup_engine_create_new_backup_flush(rocksdb_backup_engine_t* be,
rocksdb_t* db,
unsigned char flush_before_backup,
char** errptr) {
SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
}
void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
uint32_t num_backups_to_keep,
char** errptr) {
SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
}
rocksdb_restore_options_t* rocksdb_restore_options_create() {
return new rocksdb_restore_options_t;
}
void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
delete opt;
}
10 years ago
void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
int v) {
opt->rep.keep_log_files = v;
}
void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
uint32_t backup_id, char** errptr) {
SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
}
void rocksdb_backup_engine_restore_db_from_latest_backup(
10 years ago
rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
const rocksdb_restore_options_t* restore_options, char** errptr) {
SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
std::string(wal_dir),
restore_options->rep));
}
void rocksdb_backup_engine_restore_db_from_backup(
rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
const rocksdb_restore_options_t* restore_options, const uint32_t backup_id,
char** errptr) {
SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir),
std::string(wal_dir),
restore_options->rep));
}
const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
rocksdb_backup_engine_t* be) {
rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
be->rep->GetBackupInfo(&result->rep);
return result;
}
10 years ago
int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
return static_cast<int>(info->rep.size());
}
int64_t rocksdb_backup_engine_info_timestamp(
10 years ago
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].timestamp;
}
uint32_t rocksdb_backup_engine_info_backup_id(
10 years ago
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].backup_id;
}
uint64_t rocksdb_backup_engine_info_size(
10 years ago
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].size;
}
uint32_t rocksdb_backup_engine_info_number_files(
10 years ago
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].number_files;
}
void rocksdb_backup_engine_info_destroy(
const rocksdb_backup_engine_info_t* info) {
delete info;
}
10 years ago
void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
delete be->rep;
delete be;
}
rocksdb_backup_engine_options_t* rocksdb_backup_engine_options_create(
const char* backup_dir) {
return new rocksdb_backup_engine_options_t{
BackupEngineOptions(std::string(backup_dir))};
}
void rocksdb_backup_engine_options_set_backup_dir(
rocksdb_backup_engine_options_t* options, const char* backup_dir) {
options->rep.backup_dir = std::string(backup_dir);
}
void rocksdb_backup_engine_options_set_env(
rocksdb_backup_engine_options_t* options, rocksdb_env_t* env) {
options->rep.backup_env = (env ? env->rep : nullptr);
}
void rocksdb_backup_engine_options_set_share_table_files(
rocksdb_backup_engine_options_t* options, unsigned char val) {
options->rep.share_table_files = val;
}
unsigned char rocksdb_backup_engine_options_get_share_table_files(
rocksdb_backup_engine_options_t* options) {
return options->rep.share_table_files;
}
void rocksdb_backup_engine_options_set_sync(
rocksdb_backup_engine_options_t* options, unsigned char val) {
options->rep.sync = val;
}
unsigned char rocksdb_backup_engine_options_get_sync(
rocksdb_backup_engine_options_t* options) {
return options->rep.sync;
}
void rocksdb_backup_engine_options_set_destroy_old_data(
rocksdb_backup_engine_options_t* options, unsigned char val) {
options->rep.destroy_old_data = val;
}
unsigned char rocksdb_backup_engine_options_get_destroy_old_data(
rocksdb_backup_engine_options_t* options) {
return options->rep.destroy_old_data;
}
void rocksdb_backup_engine_options_set_backup_log_files(
rocksdb_backup_engine_options_t* options, unsigned char val) {
options->rep.backup_log_files = val;
}
unsigned char rocksdb_backup_engine_options_get_backup_log_files(
rocksdb_backup_engine_options_t* options) {
return options->rep.backup_log_files;
}
void rocksdb_backup_engine_options_set_backup_rate_limit(
rocksdb_backup_engine_options_t* options, uint64_t limit) {
options->rep.backup_rate_limit = limit;
}
uint64_t rocksdb_backup_engine_options_get_backup_rate_limit(
rocksdb_backup_engine_options_t* options) {
return options->rep.backup_rate_limit;
}
void rocksdb_backup_engine_options_set_restore_rate_limit(
rocksdb_backup_engine_options_t* options, uint64_t limit) {
options->rep.restore_rate_limit = limit;
}
uint64_t rocksdb_backup_engine_options_get_restore_rate_limit(
rocksdb_backup_engine_options_t* options) {
return options->rep.restore_rate_limit;
}
void rocksdb_backup_engine_options_set_max_background_operations(
rocksdb_backup_engine_options_t* options, int val) {
options->rep.max_background_operations = val;
}
int rocksdb_backup_engine_options_get_max_background_operations(
rocksdb_backup_engine_options_t* options) {
return options->rep.max_background_operations;
}
void rocksdb_backup_engine_options_set_callback_trigger_interval_size(
rocksdb_backup_engine_options_t* options, uint64_t size) {
options->rep.callback_trigger_interval_size = size;
}
uint64_t rocksdb_backup_engine_options_get_callback_trigger_interval_size(
rocksdb_backup_engine_options_t* options) {
return options->rep.callback_trigger_interval_size;
}
void rocksdb_backup_engine_options_set_max_valid_backups_to_open(
rocksdb_backup_engine_options_t* options, int val) {
options->rep.max_valid_backups_to_open = val;
}
int rocksdb_backup_engine_options_get_max_valid_backups_to_open(
rocksdb_backup_engine_options_t* options) {
return options->rep.max_valid_backups_to_open;
}
void rocksdb_backup_engine_options_set_share_files_with_checksum_naming(
rocksdb_backup_engine_options_t* options, int val) {
options->rep.share_files_with_checksum_naming =
static_cast<BackupEngineOptions::ShareFilesNaming>(val);
}
int rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
rocksdb_backup_engine_options_t* options) {
return static_cast<int>(options->rep.share_files_with_checksum_naming);
}
void rocksdb_backup_engine_options_destroy(
rocksdb_backup_engine_options_t* options) {
delete options;
}
rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
char** errptr) {
Checkpoint* checkpoint;
if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
return nullptr;
}
rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
result->rep = checkpoint;
return result;
}
void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
const char* checkpoint_dir,
uint64_t log_size_for_flush, char** errptr) {
SaveError(errptr, checkpoint->rep->CreateCheckpoint(
std::string(checkpoint_dir), log_size_for_flush));
}
void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
delete checkpoint->rep;
delete checkpoint;
}
void rocksdb_close(rocksdb_t* db) {
delete db->rep;
delete db;
}
void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
opt->rep.merge_operator =
ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
}
rocksdb_t* rocksdb_open_and_trim_history(
const rocksdb_options_t* db_options, const char* name,
int num_column_families, const char* const* column_family_names,
const rocksdb_options_t* const* column_family_options,
rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
size_t trim_tslen, char** errptr) {
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i < num_column_families; i++) {
column_families.push_back(ColumnFamilyDescriptor(
std::string(column_family_names[i]),
ColumnFamilyOptions(column_family_options[i]->rep)));
}
std::string trim_ts_(trim_ts, trim_tslen);
DB* db;
std::vector<ColumnFamilyHandle*> handles;
if (SaveError(errptr, DB::OpenAndTrimHistory(
DBOptions(db_options->rep), std::string(name),
column_families, &handles, &db, trim_ts_))) {
return nullptr;
}
for (size_t i = 0; i < handles.size(); i++) {
rocksdb_column_family_handle_t* c_handle =
new rocksdb_column_family_handle_t;
c_handle->rep = handles[i];
column_family_handles[i] = c_handle;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_t* rocksdb_open_column_families(
const rocksdb_options_t* db_options, const char* name,
int num_column_families, const char* const* column_family_names,
const rocksdb_options_t* const* column_family_options,
rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i < num_column_families; i++) {
column_families.push_back(ColumnFamilyDescriptor(
std::string(column_family_names[i]),
ColumnFamilyOptions(column_family_options[i]->rep)));
}
DB* db;
std::vector<ColumnFamilyHandle*> handles;
if (SaveError(errptr, DB::Open(DBOptions(db_options->rep),
std::string(name), column_families, &handles, &db))) {
return nullptr;
}
for (size_t i = 0; i < handles.size(); i++) {
rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
c_handle->rep = handles[i];
column_family_handles[i] = c_handle;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_t* rocksdb_open_column_families_with_ttl(
const rocksdb_options_t* db_options, const char* name,
int num_column_families, const char* const* column_family_names,
const rocksdb_options_t* const* column_family_options,
rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
char** errptr) {
std::vector<int32_t> ttls_vec;
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i < num_column_families; i++) {
ttls_vec.push_back(ttls[i]);
column_families.push_back(ColumnFamilyDescriptor(
std::string(column_family_names[i]),
ColumnFamilyOptions(column_family_options[i]->rep)));
}
ROCKSDB_NAMESPACE::DBWithTTL* db;
std::vector<ColumnFamilyHandle*> handles;
if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
DBOptions(db_options->rep), std::string(name),
column_families, &handles, &db, ttls_vec))) {
return nullptr;
}
for (size_t i = 0; i < handles.size(); i++) {
rocksdb_column_family_handle_t* c_handle =
new rocksdb_column_family_handle_t;
c_handle->rep = handles[i];
column_family_handles[i] = c_handle;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_t* rocksdb_open_for_read_only_column_families(
const rocksdb_options_t* db_options, const char* name,
int num_column_families, const char* const* column_family_names,
const rocksdb_options_t* const* column_family_options,
rocksdb_column_family_handle_t** column_family_handles,
unsigned char error_if_wal_file_exists, char** errptr) {
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i < num_column_families; i++) {
column_families.push_back(ColumnFamilyDescriptor(
std::string(column_family_names[i]),
ColumnFamilyOptions(column_family_options[i]->rep)));
}
DB* db;
std::vector<ColumnFamilyHandle*> handles;
if (SaveError(errptr,
DB::OpenForReadOnly(DBOptions(db_options->rep),
std::string(name), column_families,
&handles, &db, error_if_wal_file_exists))) {
return nullptr;
}
for (size_t i = 0; i < handles.size(); i++) {
rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
c_handle->rep = handles[i];
column_family_handles[i] = c_handle;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
rocksdb_t* rocksdb_open_as_secondary_column_families(
const rocksdb_options_t* db_options, const char* name,
const char* secondary_path, int num_column_families,
const char* const* column_family_names,
const rocksdb_options_t* const* column_family_options,
rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i != num_column_families; ++i) {
column_families.emplace_back(
std::string(column_family_names[i]),
ColumnFamilyOptions(column_family_options[i]->rep));
}
DB* db;
std::vector<ColumnFamilyHandle*> handles;
if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
std::string(name),
std::string(secondary_path),
column_families, &handles, &db))) {
return nullptr;
}
for (size_t i = 0; i != handles.size(); ++i) {
rocksdb_column_family_handle_t* c_handle =
new rocksdb_column_family_handle_t;
c_handle->rep = handles[i];
column_family_handles[i] = c_handle;
}
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
char** rocksdb_list_column_families(
const rocksdb_options_t* options,
const char* name,
size_t* lencfs,
char** errptr) {
std::vector<std::string> fams;
SaveError(errptr,
DB::ListColumnFamilies(DBOptions(options->rep),
std::string(name), &fams));
*lencfs = fams.size();
char** column_families = static_cast<char**>(malloc(sizeof(char*) * fams.size()));
for (size_t i = 0; i < fams.size(); i++) {
column_families[i] = strdup(fams[i].c_str());
}
return column_families;
}
void rocksdb_list_column_families_destroy(char** list, size_t len) {
for (size_t i = 0; i < len; ++i) {
11 years ago
free(list[i]);
}
11 years ago
free(list);
}
rocksdb_column_family_handle_t* rocksdb_create_column_family(
rocksdb_t* db,
const rocksdb_options_t* column_family_options,
const char* column_family_name,
char** errptr) {
rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
SaveError(errptr,
db->rep->CreateColumnFamily(ColumnFamilyOptions(column_family_options->rep),
std::string(column_family_name), &(handle->rep)));
return handle;
}
rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
rocksdb_t* db, const rocksdb_options_t* column_family_options,
const char* column_family_name, int ttl, char** errptr) {
ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl =
static_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(db->rep);
rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl(
ColumnFamilyOptions(column_family_options->rep),
std::string(column_family_name), &(handle->rep), ttl));
return handle;
}
void rocksdb_drop_column_family(
rocksdb_t* db,
rocksdb_column_family_handle_t* handle,
char** errptr) {
SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
}
void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t* handle) {
delete handle->rep;
delete handle;
}
void rocksdb_put(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
SaveError(errptr,
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
}
void rocksdb_put_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen, const char* val,
size_t vallen, char** errptr) {
SaveError(errptr, db->rep->Put(options->rep, column_family->rep,
Slice(key, keylen), Slice(val, vallen)));
}
void rocksdb_put_with_ts(rocksdb_t* db, const rocksdb_writeoptions_t* options,
const char* key, size_t keylen, const char* ts,
size_t tslen, const char* val, size_t vallen,
char** errptr) {
SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen),
Slice(ts, tslen), Slice(val, vallen)));
}
void rocksdb_put_cf_with_ts(rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen, const char* ts,
size_t tslen, const char* val, size_t vallen,
char** errptr) {
SaveError(errptr,
db->rep->Put(options->rep, column_family->rep, Slice(key, keylen),
Slice(ts, tslen), Slice(val, vallen)));
}
void rocksdb_delete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
const char* key, size_t keylen, char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
}
void rocksdb_delete_cf(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
Slice(key, keylen)));
}
void rocksdb_delete_with_ts(rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen, const char* ts,
size_t tslen, char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen),
Slice(ts, tslen)));
}
void rocksdb_delete_cf_with_ts(rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen, const char* ts,
size_t tslen, char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
Slice(key, keylen), Slice(ts, tslen)));
}
void rocksdb_singledelete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
const char* key, size_t keylen, char** errptr) {
SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen)));
}
void rocksdb_singledelete_cf(rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen, char** errptr) {
SaveError(errptr, db->rep->SingleDelete(options->rep, column_family->rep,
Slice(key, keylen)));
}
void rocksdb_singledelete_with_ts(rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
const char* ts, size_t tslen, char** errptr) {
SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen),
Slice(ts, tslen)));
}
void rocksdb_singledelete_cf_with_ts(
rocksdb_t* db, const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
size_t keylen, const char* ts, size_t tslen, char** errptr) {
SaveError(errptr,
db->rep->SingleDelete(options->rep, column_family->rep,
Slice(key, keylen), Slice(ts, tslen)));
}
void rocksdb_increase_full_history_ts_low(
rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
const char* ts_low, size_t ts_lowlen, char** errptr) {
std::string ts(ts_low, ts_lowlen);
SaveError(errptr, db->rep->IncreaseFullHistoryTsLow(column_family->rep, ts));
}
char* rocksdb_get_full_history_ts_low(
rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
size_t* ts_len, char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = db->rep->GetFullHistoryTsLow(column_family->rep, &tmp);
if (s.ok()) {
*ts_len = tmp.size();
result = CopyString(tmp);
} else {
*ts_len = 0;
SaveError(errptr, s);
}
return result;
}
void rocksdb_delete_range_cf(rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* start_key, size_t start_key_len,
const char* end_key, size_t end_key_len,
char** errptr) {
SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
Slice(start_key, start_key_len),
Slice(end_key, end_key_len)));
}
void rocksdb_merge(rocksdb_t* db, const rocksdb_writeoptions_t* options,
const char* key, size_t keylen, const char* val,
size_t vallen, char** errptr) {
SaveError(errptr, db->rep->Merge(options->rep, Slice(key, keylen),
Slice(val, vallen)));
}
void rocksdb_merge_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen, const char* val,
size_t vallen, char** errptr) {
SaveError(errptr,
db->rep->Merge(options->rep, column_family->rep,
Slice(key, keylen), Slice(val, vallen)));
}
void rocksdb_write(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_writebatch_t* batch,
char** errptr) {
SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
}
char* rocksdb_get(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
char* rocksdb_get_cf(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = db->rep->Get(options->rep, column_family->rep,
Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
char* rocksdb_get_with_ts(rocksdb_t* db, const rocksdb_readoptions_t* options,
const char* key, size_t keylen, size_t* vallen,
char** ts, size_t* tslen, char** errptr) {
char* result = nullptr;
std::string tmp_val;
std::string tmp_ts;
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp_val, &tmp_ts);
if (s.ok()) {
*vallen = tmp_val.size();
result = CopyString(tmp_val);
*tslen = tmp_ts.size();
*ts = CopyString(tmp_ts);
} else {
*vallen = 0;
*tslen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
char* rocksdb_get_cf_with_ts(rocksdb_t* db,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen, size_t* vallen,
char** ts, size_t* tslen, char** errptr) {
char* result = nullptr;
std::string tmp;
std::string tmp_ts;
Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
&tmp, &tmp_ts);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
*tslen = tmp_ts.size();
*ts = CopyString(tmp_ts);
} else {
*vallen = 0;
*tslen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_multi_get_with_ts(rocksdb_t* db,
const rocksdb_readoptions_t* options,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
char** values_list, size_t* values_list_sizes,
char** timestamp_list,
size_t* timestamp_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<std::string> values(num_keys);
std::vector<std::string> timestamps(num_keys);
std::vector<Status> statuses =
db->rep->MultiGet(options->rep, keys, &values, &timestamps);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
timestamp_list[i] = CopyString(timestamps[i]);
timestamp_list_sizes[i] = timestamps[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
timestamp_list[i] = nullptr;
timestamp_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_multi_get_cf(
rocksdb_t* db, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
std::vector<ColumnFamilyHandle*> cfs(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
cfs[i] = column_families[i]->rep;
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
db->rep->MultiGet(options->rep, cfs, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_multi_get_cf_with_ts(
rocksdb_t* db, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** timestamps_list,
size_t* timestamps_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
std::vector<ColumnFamilyHandle*> cfs(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
cfs[i] = column_families[i]->rep;
}
std::vector<std::string> values(num_keys);
std::vector<std::string> timestamps(num_keys);
std::vector<Status> statuses =
db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
timestamps_list[i] = CopyString(timestamps[i]);
timestamps_list_sizes[i] = timestamps[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
timestamps_list[i] = nullptr;
timestamps_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_batched_multi_get_cf(rocksdb_t* db,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
rocksdb_pinnableslice_t** values, char** errs,
const bool sorted_input) {
Slice* key_slices = new Slice[num_keys];
PinnableSlice* value_slices = new PinnableSlice[num_keys];
Status* statuses = new Status[num_keys];
for (size_t i = 0; i < num_keys; ++i) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
value_slices, statuses, sorted_input);
for (size_t i = 0; i < num_keys; ++i) {
if (statuses[i].ok()) {
values[i] = new (rocksdb_pinnableslice_t);
values[i]->rep = std::move(value_slices[i]);
errs[i] = nullptr;
} else {
values[i] = nullptr;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
delete[] key_slices;
delete[] value_slices;
delete[] statuses;
}
unsigned char rocksdb_key_may_exist(rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key, size_t key_len,
char** value, size_t* val_len,
const char* timestamp, size_t timestamp_len,
unsigned char* value_found) {
std::string tmp;
std::string time;
if (timestamp) {
time.assign(timestamp, timestamp_len);
}
bool found = false;
const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len),
&tmp, timestamp ? &time : nullptr,
value_found ? &found : nullptr);
if (value_found) {
*value_found = found;
if (found) {
*val_len = tmp.size();
*value = CopyString(tmp);
}
}
return result;
}
unsigned char rocksdb_key_may_exist_cf(
rocksdb_t* db, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
size_t key_len, char** value, size_t* val_len, const char* timestamp,
size_t timestamp_len, unsigned char* value_found) {
std::string tmp;
std::string time;
if (timestamp) {
time.assign(timestamp, timestamp_len);
}
bool found = false;
const bool result = db->rep->KeyMayExist(
options->rep, column_family->rep, Slice(key, key_len), &tmp,
timestamp ? &time : nullptr, value_found ? &found : nullptr);
if (value_found) {
*value_found = found;
if (found) {
*val_len = tmp.size();
*value = CopyString(tmp);
}
}
return result;
}
rocksdb_iterator_t* rocksdb_create_iterator(
rocksdb_t* db,
const rocksdb_readoptions_t* options) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = db->rep->NewIterator(options->rep);
return result;
}
rocksdb_wal_iterator_t* rocksdb_get_updates_since(
rocksdb_t* db, uint64_t seq_number,
const rocksdb_wal_readoptions_t* options,
char** errptr) {
std::unique_ptr<TransactionLogIterator> iter;
TransactionLogIterator::ReadOptions ro;
if (options!=nullptr) {
ro = options->rep;
}
if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
return nullptr;
}
rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
result->rep = iter.release();
return result;
}
void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) {
iter->rep->Next();
}
unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
return iter->rep->Valid();
}
void rocksdb_wal_iter_status (const rocksdb_wal_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status());
}
void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) {
delete iter->rep;
delete iter;
}
rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
rocksdb_writebatch_t* result = rocksdb_writebatch_create();
BatchResult wal_batch = iter->rep->GetBatch();
result->rep = std::move(*wal_batch.writeBatchPtr);
if (seq != nullptr) {
*seq = wal_batch.sequence;
}
return result;
}
uint64_t rocksdb_get_latest_sequence_number (rocksdb_t *db) {
return db->rep->GetLatestSequenceNumber();
}
rocksdb_iterator_t* rocksdb_create_iterator_cf(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = db->rep->NewIterator(options->rep, column_family->rep);
return result;
}
void rocksdb_create_iterators(
rocksdb_t *db,
rocksdb_readoptions_t* opts,
rocksdb_column_family_handle_t** column_families,
rocksdb_iterator_t** iterators,
size_t size,
char** errptr) {
std::vector<ColumnFamilyHandle*> column_families_vec;
for (size_t i = 0; i < size; i++) {
column_families_vec.push_back(column_families[i]->rep);
}
std::vector<Iterator*> res;
Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
assert(res.size() == size);
if (SaveError(errptr, status)) {
return;
}
for (size_t i = 0; i < size; i++) {
iterators[i] = new rocksdb_iterator_t;
iterators[i]->rep = res[i];
}
}
const rocksdb_snapshot_t* rocksdb_create_snapshot(
rocksdb_t* db) {
rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
result->rep = db->rep->GetSnapshot();
return result;
}
void rocksdb_release_snapshot(
rocksdb_t* db,
const rocksdb_snapshot_t* snapshot) {
db->rep->ReleaseSnapshot(snapshot->rep);
delete snapshot;
}
char* rocksdb_property_value(
rocksdb_t* db,
const char* propname) {
std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) {
// We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else {
return nullptr;
}
}
int rocksdb_property_int(
rocksdb_t* db,
const char* propname,
uint64_t *out_val) {
if (db->rep->GetIntProperty(Slice(propname), out_val)) {
return 0;
} else {
return -1;
}
}
int rocksdb_property_int_cf(
rocksdb_t* db,
rocksdb_column_family_handle_t* column_family,
const char* propname,
uint64_t *out_val) {
if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
return 0;
} else {
return -1;
}
}
char* rocksdb_property_value_cf(
rocksdb_t* db,
rocksdb_column_family_handle_t* column_family,
const char* propname) {
std::string tmp;
if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
// We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else {
return nullptr;
}
}
void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges,
const char* const* range_start_key,
const size_t* range_start_key_len,
const char* const* range_limit_key,
const size_t* range_limit_key_len,
uint64_t* sizes, char** errptr) {
Range* ranges = new Range[num_ranges];
for (int i = 0; i < num_ranges; i++) {
ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
}
Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
if (!s.ok()) {
SaveError(errptr, s);
}
delete[] ranges;
}
void rocksdb_approximate_sizes_cf(
rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
int num_ranges, const char* const* range_start_key,
const size_t* range_start_key_len, const char* const* range_limit_key,
const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) {
Range* ranges = new Range[num_ranges];
for (int i = 0; i < num_ranges; i++) {
ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
}
Status s = db->rep->GetApproximateSizes(column_family->rep, ranges,
num_ranges, sizes);
if (!s.ok()) {
SaveError(errptr, s);
}
delete[] ranges;
}
void rocksdb_delete_file(
rocksdb_t* db,
const char* name) {
db->rep->DeleteFile(name);
}
const rocksdb_livefiles_t* rocksdb_livefiles(
rocksdb_t* db) {
rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
db->rep->GetLiveFilesMetaData(&result->rep);
return result;
}
void rocksdb_compact_range(
rocksdb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
CompactRangeOptions(),
// Pass nullptr Slice if corresponding "const char*" is nullptr
(start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
}
void rocksdb_compact_range_cf(
rocksdb_t* db,
rocksdb_column_family_handle_t* column_family,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
CompactRangeOptions(), column_family->rep,
// Pass nullptr Slice if corresponding "const char*" is nullptr
(start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
}
void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
opt->rep,
// Pass nullptr Slice if corresponding "const char*" is nullptr
(start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
}
void rocksdb_compact_range_cf_opt(rocksdb_t* db,
rocksdb_column_family_handle_t* column_family,
rocksdb_compactoptions_t* opt,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
opt->rep, column_family->rep,
// Pass nullptr Slice if corresponding "const char*" is nullptr
(start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
}
void rocksdb_flush(
rocksdb_t* db,
const rocksdb_flushoptions_t* options,
char** errptr) {
SaveError(errptr, db->rep->Flush(options->rep));
}
void rocksdb_flush_cf(
rocksdb_t* db,
const rocksdb_flushoptions_t* options,
rocksdb_column_family_handle_t* column_family,
char** errptr) {
SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
}
void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
SaveError(errptr, db->rep->FlushWAL(sync));
}
void rocksdb_disable_file_deletions(
rocksdb_t* db,
char** errptr) {
SaveError(errptr, db->rep->DisableFileDeletions());
}
void rocksdb_enable_file_deletions(
rocksdb_t* db,
unsigned char force,
char** errptr) {
SaveError(errptr, db->rep->EnableFileDeletions(force));
}
void rocksdb_destroy_db(
const rocksdb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, DestroyDB(name, options->rep));
}
void rocksdb_repair_db(
const rocksdb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, RepairDB(name, options->rep));
}
void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
delete iter->rep;
delete iter;
}
unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
return iter->rep->Valid();
}
void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
iter->rep->SeekToFirst();
}
void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
iter->rep->SeekToLast();
}
void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
iter->rep->Seek(Slice(k, klen));
}
void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
size_t klen) {
iter->rep->SeekForPrev(Slice(k, klen));
}
void rocksdb_iter_next(rocksdb_iterator_t* iter) {
iter->rep->Next();
}
void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
iter->rep->Prev();
}
const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
Slice s = iter->rep->key();
*klen = s.size();
return s.data();
}
const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
Slice s = iter->rep->value();
*vlen = s.size();
return s.data();
}
const char* rocksdb_iter_timestamp(const rocksdb_iterator_t* iter,
size_t* tslen) {
Slice s = iter->rep->timestamp();
*tslen = s.size();
return s.data();
}
void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status());
}
rocksdb_writebatch_t* rocksdb_writebatch_create() {
return new rocksdb_writebatch_t;
}
rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
size_t size) {
rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
b->rep = WriteBatch(std::string(rep, size));
return b;
}
void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; }
void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); }
int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { return b->rep.Count(); }
void rocksdb_writebatch_put(rocksdb_writebatch_t* b, const char* key,
size_t klen, const char* val, size_t vlen) {
b->rep.Put(Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_put_cf(rocksdb_writebatch_t* b,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, const char* val,
size_t vlen) {
b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_put_cf_with_ts(
rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
size_t vlen) {
b->rep.Put(column_family->rep, Slice(key, klen), Slice(ts, tslen),
Slice(val, vlen));
}
void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
const char* const* keys_list,
const size_t* keys_list_sizes, int num_values,
const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep.Put(SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, int num_values,
const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
size_t klen, const char* val, size_t vlen) {
b->rep.Merge(Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_merge_cf(
rocksdb_writebatch_t* b,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_mergev(
rocksdb_writebatch_t* b,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
int num_values, const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep.Merge(SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_mergev_cf(
rocksdb_writebatch_t* b,
rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
int num_values, const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_delete(
rocksdb_writebatch_t* b,
const char* key, size_t klen) {
b->rep.Delete(Slice(key, klen));
}
void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key,
size_t klen) {
b->rep.SingleDelete(Slice(key, klen));
}
void rocksdb_writebatch_delete_cf(
rocksdb_writebatch_t* b,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen) {
b->rep.Delete(column_family->rep, Slice(key, klen));
}
void rocksdb_writebatch_delete_cf_with_ts(
rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, const char* ts, size_t tslen) {
b->rep.Delete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
}
void rocksdb_writebatch_singledelete_cf(
rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen) {
b->rep.SingleDelete(column_family->rep, Slice(key, klen));
}
void rocksdb_writebatch_singledelete_cf_with_ts(
rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, const char* ts, size_t tslen) {
b->rep.SingleDelete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
}
void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys,
const char* const* keys_list,
const size_t* keys_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
b->rep.Delete(SliceParts(key_slices.data(), num_keys));
}
void rocksdb_writebatch_deletev_cf(
rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
}
void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
const char* start_key,
size_t start_key_len, const char* end_key,
size_t end_key_len) {
b->rep.DeleteRange(Slice(start_key, start_key_len),
Slice(end_key, end_key_len));
}
void rocksdb_writebatch_delete_range_cf(
rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
const char* start_key, size_t start_key_len, const char* end_key,
size_t end_key_len) {
b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
Slice(end_key, end_key_len));
}
void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
const char* const* start_keys_list,
const size_t* start_keys_list_sizes,
const char* const* end_keys_list,
const size_t* end_keys_list_sizes) {
std::vector<Slice> start_key_slices(num_keys);
std::vector<Slice> end_key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
}
b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
SliceParts(end_key_slices.data(), num_keys));
}
void rocksdb_writebatch_delete_rangev_cf(
rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* start_keys_list,
const size_t* start_keys_list_sizes, const char* const* end_keys_list,
const size_t* end_keys_list_sizes) {
std::vector<Slice> start_key_slices(num_keys);
std::vector<Slice> end_key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
}
b->rep.DeleteRange(column_family->rep,
SliceParts(start_key_slices.data(), num_keys),
SliceParts(end_key_slices.data(), num_keys));
}
void rocksdb_writebatch_put_log_data(
rocksdb_writebatch_t* b,
const char* blob, size_t len) {
b->rep.PutLogData(Slice(blob, len));
}
class H : public WriteBatch::Handler {
public:
void* state_;
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
void (*deleted_)(void*, const char* k, size_t klen);
void Put(const Slice& key, const Slice& value) override {
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
}
void Delete(const Slice& key) override {
(*deleted_)(state_, key.data(), key.size());
}
};
void rocksdb_writebatch_iterate(
rocksdb_writebatch_t* b,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen)) {
H handler;
handler.state_ = state;
handler.put_ = put;
handler.deleted_ = deleted;
b->rep.Iterate(&handler);
}
const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
*size = b->rep.GetDataSize();
return b->rep.Data().c_str();
}
void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
b->rep.SetSavePoint();
}
void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
char** errptr) {
SaveError(errptr, b->rep.RollbackToSavePoint());
}
void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
SaveError(errptr, b->rep.PopSavePoint());
}
rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(size_t reserved_bytes, unsigned char overwrite_key) {
rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes, overwrite_key);
return b;
}
void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
if (b->rep) {
delete b->rep;
}
delete b;
}
void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
b->rep->Clear();
}
int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
return b->rep->GetWriteBatch()->Count();
}
void rocksdb_writebatch_wi_put(
rocksdb_writebatch_wi_t* b,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep->Put(Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_wi_put_cf(
rocksdb_writebatch_wi_t* b,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_wi_putv(
rocksdb_writebatch_wi_t* b,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
int num_values, const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep->Put(SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_wi_putv_cf(
rocksdb_writebatch_wi_t* b,
rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
int num_values, const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_wi_merge(
rocksdb_writebatch_wi_t* b,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep->Merge(Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_wi_merge_cf(
rocksdb_writebatch_wi_t* b,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
}
void rocksdb_writebatch_wi_mergev(
rocksdb_writebatch_wi_t* b,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
int num_values, const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep->Merge(SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_wi_mergev_cf(
rocksdb_writebatch_wi_t* b,
rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes,
int num_values, const char* const* values_list,
const size_t* values_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<Slice> value_slices(num_values);
for (int i = 0; i < num_values; i++) {
value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
}
b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
SliceParts(value_slices.data(), num_values));
}
void rocksdb_writebatch_wi_delete(
rocksdb_writebatch_wi_t* b,
const char* key, size_t klen) {
b->rep->Delete(Slice(key, klen));
}
void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b,
const char* key, size_t klen) {
b->rep->SingleDelete(Slice(key, klen));
}
void rocksdb_writebatch_wi_delete_cf(
rocksdb_writebatch_wi_t* b,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen) {
b->rep->Delete(column_family->rep, Slice(key, klen));
}
void rocksdb_writebatch_wi_singledelete_cf(
rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen) {
b->rep->SingleDelete(column_family->rep, Slice(key, klen));
}
void rocksdb_writebatch_wi_deletev(
rocksdb_writebatch_wi_t* b,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
b->rep->Delete(SliceParts(key_slices.data(), num_keys));
}
void rocksdb_writebatch_wi_deletev_cf(
rocksdb_writebatch_wi_t* b,
rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* keys_list,
const size_t* keys_list_sizes) {
std::vector<Slice> key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
}
void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
const char* start_key,
size_t start_key_len, const char* end_key,
size_t end_key_len) {
b->rep->DeleteRange(Slice(start_key, start_key_len),
Slice(end_key, end_key_len));
}
void rocksdb_writebatch_wi_delete_range_cf(
rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
const char* start_key, size_t start_key_len, const char* end_key,
size_t end_key_len) {
b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
Slice(end_key, end_key_len));
}
void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, int num_keys,
const char* const* start_keys_list,
const size_t* start_keys_list_sizes,
const char* const* end_keys_list,
const size_t* end_keys_list_sizes) {
std::vector<Slice> start_key_slices(num_keys);
std::vector<Slice> end_key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
}
b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
SliceParts(end_key_slices.data(), num_keys));
}
void rocksdb_writebatch_wi_delete_rangev_cf(
rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
int num_keys, const char* const* start_keys_list,
const size_t* start_keys_list_sizes, const char* const* end_keys_list,
const size_t* end_keys_list_sizes) {
std::vector<Slice> start_key_slices(num_keys);
std::vector<Slice> end_key_slices(num_keys);
for (int i = 0; i < num_keys; i++) {
start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
}
b->rep->DeleteRange(column_family->rep,
SliceParts(start_key_slices.data(), num_keys),
SliceParts(end_key_slices.data(), num_keys));
}
void rocksdb_writebatch_wi_put_log_data(
rocksdb_writebatch_wi_t* b,
const char* blob, size_t len) {
b->rep->PutLogData(Slice(blob, len));
}
void rocksdb_writebatch_wi_iterate(
rocksdb_writebatch_wi_t* b,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen)) {
H handler;
handler.state_ = state;
handler.put_ = put;
handler.deleted_ = deleted;
b->rep->GetWriteBatch()->Iterate(&handler);
}
const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b, size_t* size) {
WriteBatch* wb = b->rep->GetWriteBatch();
*size = wb->GetDataSize();
return wb->Data().c_str();
}
void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
b->rep->SetSavePoint();
}
void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
char** errptr) {
SaveError(errptr, b->rep->RollbackToSavePoint());
}
rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
rocksdb_writebatch_wi_t* wbwi,
rocksdb_iterator_t* base_iterator) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
delete base_iterator;
return result;
}
rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
rocksdb_column_family_handle_t* column_family) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep =
wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
delete base_iterator;
return result;
}
char* rocksdb_writebatch_wi_get_from_batch(
rocksdb_writebatch_wi_t* wbwi,
const rocksdb_options_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
char* rocksdb_writebatch_wi_get_from_batch_cf(
rocksdb_writebatch_wi_t* wbwi,
const rocksdb_options_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
char* rocksdb_writebatch_wi_get_from_batch_and_db(
rocksdb_writebatch_wi_t* wbwi,
rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
rocksdb_writebatch_wi_t* wbwi,
rocksdb_t* db,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep,
Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
void rocksdb_write_writebatch_wi(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_writebatch_wi_t* wbwi,
char** errptr) {
WriteBatch* wb = wbwi->rep->GetWriteBatch();
SaveError(errptr, db->rep->Write(options->rep, wb));
}
rocksdb_block_based_table_options_t*
rocksdb_block_based_options_create() {
return new rocksdb_block_based_table_options_t;
}
void rocksdb_block_based_options_destroy(
rocksdb_block_based_table_options_t* options) {
delete options;
}
void rocksdb_block_based_options_set_block_size(
rocksdb_block_based_table_options_t* options, size_t block_size) {
options->rep.block_size = block_size;
}
void rocksdb_block_based_options_set_block_size_deviation(
rocksdb_block_based_table_options_t* options, int block_size_deviation) {
options->rep.block_size_deviation = block_size_deviation;
}
void rocksdb_block_based_options_set_block_restart_interval(
rocksdb_block_based_table_options_t* options, int block_restart_interval) {
options->rep.block_restart_interval = block_restart_interval;
}
void rocksdb_block_based_options_set_index_block_restart_interval(
rocksdb_block_based_table_options_t* options, int index_block_restart_interval) {
options->rep.index_block_restart_interval = index_block_restart_interval;
}
void rocksdb_block_based_options_set_metadata_block_size(
rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size) {
options->rep.metadata_block_size = metadata_block_size;
}
void rocksdb_block_based_options_set_partition_filters(
rocksdb_block_based_table_options_t* options, unsigned char partition_filters) {
options->rep.partition_filters = partition_filters;
}
void rocksdb_block_based_options_set_use_delta_encoding(
rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding) {
options->rep.use_delta_encoding = use_delta_encoding;
}
void rocksdb_block_based_options_set_filter_policy(
rocksdb_block_based_table_options_t* options,
rocksdb_filterpolicy_t* filter_policy) {
options->rep.filter_policy.reset(filter_policy);
}
void rocksdb_block_based_options_set_no_block_cache(
rocksdb_block_based_table_options_t* options,
unsigned char no_block_cache) {
options->rep.no_block_cache = no_block_cache;
}
void rocksdb_block_based_options_set_block_cache(
rocksdb_block_based_table_options_t* options,
rocksdb_cache_t* block_cache) {
if (block_cache) {
options->rep.block_cache = block_cache->rep;
}
}
void rocksdb_block_based_options_set_block_cache_compressed(
rocksdb_block_based_table_options_t* options,
rocksdb_cache_t* block_cache_compressed) {
if (block_cache_compressed) {
options->rep.block_cache_compressed = block_cache_compressed->rep;
}
}
void rocksdb_block_based_options_set_whole_key_filtering(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.whole_key_filtering = v;
}
void rocksdb_block_based_options_set_format_version(
rocksdb_block_based_table_options_t* options, int v) {
options->rep.format_version = v;
}
void rocksdb_block_based_options_set_index_type(
rocksdb_block_based_table_options_t* options, int v) {
options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
}
void rocksdb_block_based_options_set_data_block_index_type(
rocksdb_block_based_table_options_t* options, int v) {
options->rep.data_block_index_type =
static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
}
void rocksdb_block_based_options_set_data_block_hash_ratio(
rocksdb_block_based_table_options_t* options, double v) {
options->rep.data_block_hash_table_util_ratio = v;
}
void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.cache_index_and_filter_blocks = v;
}
void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.cache_index_and_filter_blocks_with_high_priority = v;
}
void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
}
void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.pin_top_level_index_and_filter = v;
}
void rocksdb_options_set_block_based_table_factory(
rocksdb_options_t *opt,
rocksdb_block_based_table_options_t* table_options) {
if (table_options) {
opt->rep.table_factory.reset(
ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
}
}
rocksdb_cuckoo_table_options_t*
rocksdb_cuckoo_options_create() {
return new rocksdb_cuckoo_table_options_t;
}
void rocksdb_cuckoo_options_destroy(
rocksdb_cuckoo_table_options_t* options) {
delete options;
}
void rocksdb_cuckoo_options_set_hash_ratio(
rocksdb_cuckoo_table_options_t* options, double v) {
options->rep.hash_table_ratio = v;
}
void rocksdb_cuckoo_options_set_max_search_depth(
rocksdb_cuckoo_table_options_t* options, uint32_t v) {
options->rep.max_search_depth = v;
}
void rocksdb_cuckoo_options_set_cuckoo_block_size(
rocksdb_cuckoo_table_options_t* options, uint32_t v) {
options->rep.cuckoo_block_size = v;
}
void rocksdb_cuckoo_options_set_identity_as_first_hash(
rocksdb_cuckoo_table_options_t* options, unsigned char v) {
options->rep.identity_as_first_hash = v;
}
void rocksdb_cuckoo_options_set_use_module_hash(
rocksdb_cuckoo_table_options_t* options, unsigned char v) {
options->rep.use_module_hash = v;
}
void rocksdb_options_set_cuckoo_table_factory(
rocksdb_options_t *opt,
rocksdb_cuckoo_table_options_t* table_options) {
if (table_options) {
opt->rep.table_factory.reset(
ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
}
}
void rocksdb_set_options(
rocksdb_t* db, int count, const char* const keys[], const char* const values[], char** errptr) {
std::unordered_map<std::string, std::string> options_map;
for (int i=0; i<count; i++)
options_map[keys[i]] = values[i];
SaveError(errptr,
db->rep->SetOptions(options_map));
}
void rocksdb_set_options_cf(
rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr) {
std::unordered_map<std::string, std::string> options_map;
for (int i=0; i<count; i++)
options_map[keys[i]] = values[i];
SaveError(errptr,
db->rep->SetOptions(handle->rep, options_map));
}
rocksdb_options_t* rocksdb_options_create() {
return new rocksdb_options_t;
}
void rocksdb_options_destroy(rocksdb_options_t* options) {
delete options;
}
rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) {
return new rocksdb_options_t(*options);
}
void rocksdb_options_increase_parallelism(
rocksdb_options_t* opt, int total_threads) {
opt->rep.IncreaseParallelism(total_threads);
}
void rocksdb_options_optimize_for_point_lookup(
rocksdb_options_t* opt, uint64_t block_cache_size_mb) {
opt->rep.OptimizeForPointLookup(block_cache_size_mb);
}
void rocksdb_options_optimize_level_style_compaction(
rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
}
void rocksdb_options_optimize_universal_style_compaction(
rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
}
void rocksdb_options_set_allow_ingest_behind(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.allow_ingest_behind = v;
}
unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) {
return opt->rep.allow_ingest_behind;
}
void rocksdb_options_set_compaction_filter(
rocksdb_options_t* opt,
rocksdb_compactionfilter_t* filter) {
opt->rep.compaction_filter = filter;
}
void rocksdb_options_set_compaction_filter_factory(
rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
opt->rep.compaction_filter_factory =
std::shared_ptr<CompactionFilterFactory>(factory);
}
void rocksdb_options_compaction_readahead_size(
rocksdb_options_t* opt, size_t s) {
opt->rep.compaction_readahead_size = s;
}
size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) {
return opt->rep.compaction_readahead_size;
}
void rocksdb_options_set_comparator(
rocksdb_options_t* opt,
rocksdb_comparator_t* cmp) {
opt->rep.comparator = cmp;
}
11 years ago
void rocksdb_options_set_merge_operator(
rocksdb_options_t* opt,
rocksdb_mergeoperator_t* merge_operator) {
opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
}
void rocksdb_options_set_create_if_missing(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.create_if_missing = v;
}
unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) {
return opt->rep.create_if_missing;
}
void rocksdb_options_set_create_missing_column_families(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.create_missing_column_families = v;
}
unsigned char rocksdb_options_get_create_missing_column_families(
rocksdb_options_t* opt) {
return opt->rep.create_missing_column_families;
}
void rocksdb_options_set_error_if_exists(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.error_if_exists = v;
}
unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) {
return opt->rep.error_if_exists;
}
void rocksdb_options_set_paranoid_checks(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.paranoid_checks = v;
}
unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) {
return opt->rep.paranoid_checks;
}
void rocksdb_options_set_db_paths(rocksdb_options_t* opt,
const rocksdb_dbpath_t** dbpath_values,
size_t num_paths) {
std::vector<DbPath> db_paths(num_paths);
for (size_t i = 0; i < num_paths; ++i) {
db_paths[i] = dbpath_values[i]->rep;
}
opt->rep.db_paths = db_paths;
}
void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
opt->rep.env = (env ? env->rep : nullptr);
}
void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
if (l) {
opt->rep.info_log = l->rep;
}
}
void rocksdb_options_set_info_log_level(
rocksdb_options_t* opt, int v) {
opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
}
int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) {
return static_cast<int>(opt->rep.info_log_level);
}
void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
size_t s) {
opt->rep.db_write_buffer_size = s;
}
size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) {
return opt->rep.db_write_buffer_size;
}
void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s;
}
size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) {
return opt->rep.write_buffer_size;
}
void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
opt->rep.max_open_files = n;
}
int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) {
return opt->rep.max_open_files;
}
void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, int n) {
opt->rep.max_file_opening_threads = n;
}
int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) {
return opt->rep.max_file_opening_threads;
}
void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) {
opt->rep.max_total_wal_size = n;
}
uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) {
return opt->rep.max_total_wal_size;
}
void rocksdb_options_set_target_file_size_base(
rocksdb_options_t* opt, uint64_t n) {
opt->rep.target_file_size_base = n;
}
uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) {
return opt->rep.target_file_size_base;
}
void rocksdb_options_set_target_file_size_multiplier(
rocksdb_options_t* opt, int n) {
opt->rep.target_file_size_multiplier = n;
}
int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) {
return opt->rep.target_file_size_multiplier;
}
void rocksdb_options_set_max_bytes_for_level_base(
rocksdb_options_t* opt, uint64_t n) {
opt->rep.max_bytes_for_level_base = n;
}
uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) {
return opt->rep.max_bytes_for_level_base;
}
void rocksdb_options_set_level_compaction_dynamic_level_bytes(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.level_compaction_dynamic_level_bytes = v;
}
unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes(
rocksdb_options_t* opt) {
return opt->rep.level_compaction_dynamic_level_bytes;
}
void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt,
double n) {
opt->rep.max_bytes_for_level_multiplier = n;
}
double rocksdb_options_get_max_bytes_for_level_multiplier(
rocksdb_options_t* opt) {
return opt->rep.max_bytes_for_level_multiplier;
}
void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt,
uint64_t n) {
opt->rep.max_compaction_bytes = n;
}
uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) {
return opt->rep.max_compaction_bytes;
}
void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
rocksdb_options_t* opt, int* level_values, size_t num_levels) {
opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
for (size_t i = 0; i < num_levels; ++i) {
opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
}
}
void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
}
void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.skip_stats_update_on_db_open = val;
}
unsigned char rocksdb_options_get_skip_stats_update_on_db_open(
rocksdb_options_t* opt) {
return opt->rep.skip_stats_update_on_db_open;
}
void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
rocksdb_options_t* opt, unsigned char val) {
opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
}
unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
rocksdb_options_t* opt) {
return opt->rep.skip_checking_sst_file_sizes_on_db_open;
}
/* Blob Options Settings */
void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.enable_blob_files = val;
}
extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files(
rocksdb_options_t* opt) {
return opt->rep.enable_blob_files;
}
void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) {
opt->rep.min_blob_size = val;
}
uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) {
return opt->rep.min_blob_size;
}
void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) {
opt->rep.blob_file_size = val;
}
uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) {
return opt->rep.blob_file_size;
}
void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt,
int val) {
opt->rep.blob_compression_type = static_cast<CompressionType>(val);
}
int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) {
return opt->rep.blob_compression_type;
}
void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.enable_blob_garbage_collection = val;
}
unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) {
return opt->rep.enable_blob_garbage_collection;
}
void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt,
double val) {
opt->rep.blob_garbage_collection_age_cutoff = val;
}
double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) {
return opt->rep.blob_garbage_collection_age_cutoff;
}
Make it possible to force the garbage collection of the oldest blob files (#8994) Summary: The current BlobDB garbage collection logic works by relocating the valid blobs from the oldest blob files as they are encountered during compaction, and cleaning up blob files once they contain nothing but garbage. However, with sufficiently skewed workloads, it is theoretically possible to end up in a situation when few or no compactions get scheduled for the SST files that contain references to the oldest blob files, which can lead to increased space amp due to the lack of GC. In order to efficiently handle such workloads, the patch adds a new BlobDB configuration option called `blob_garbage_collection_force_threshold`, which signals to BlobDB to schedule targeted compactions for the SST files that keep alive the oldest batch of blob files if the overall ratio of garbage in the given blob files meets the threshold *and* all the given blob files are eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example, if the new option is set to 0.9, targeted compactions will get scheduled if the sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the oldest blob files, assuming all affected blob files are below the age-based cutoff.) The net result of these targeted compactions is that the valid blobs in the oldest blob files are relocated and the oldest blob files themselves cleaned up (since *all* SST files that rely on them get compacted away). These targeted compactions are similar to periodic compactions in the sense that they force certain SST files that otherwise would not get picked up to undergo compaction and also in the sense that instead of merging files from multiple levels, they target a single file. (Note: such compactions might still include neighboring files from the same level due to the need of having a "clean cut" boundary but they never include any files from any other level.) This functionality is currently only supported with the leveled compaction style and is inactive by default (since the default value is set to 1.0, i.e. 100%). Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994 Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests. Reviewed By: riversand963 Differential Revision: D31489850 Pulled By: ltamasi fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
3 years ago
void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt,
double val) {
opt->rep.blob_garbage_collection_force_threshold = val;
}
double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) {
return opt->rep.blob_garbage_collection_force_threshold;
}
void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt,
uint64_t val) {
opt->rep.blob_compaction_readahead_size = val;
}
uint64_t rocksdb_options_get_blob_compaction_readahead_size(
rocksdb_options_t* opt) {
return opt->rep.blob_compaction_readahead_size;
}
Make it possible to enable blob files starting from a certain LSM tree level (#10077) Summary: Currently, if blob files are enabled (i.e. `enable_blob_files` is true), large values are extracted both during flush/recovery (when SST files are written into level 0 of the LSM tree) and during compaction into any LSM tree level. For certain use cases that have a mix of short-lived and long-lived values, it might make sense to support extracting large values only during compactions whose output level is greater than or equal to a specified LSM tree level (e.g. compactions into L1/L2/... or above). This could reduce the space amplification caused by large values that are turned into garbage shortly after being written at the price of some write amplification incurred by long-lived values whose extraction to blob files is delayed. In order to achieve this, we would like to do the following: - Add a new configuration option `blob_file_starting_level` (default: 0) to `AdvancedColumnFamilyOptions` (and `MutableCFOptions` and extend the related logic) - Instantiate `BlobFileBuilder` in `BuildTable` (used during flush and recovery, where the LSM tree level is L0) and `CompactionJob` iff `enable_blob_files` is set and the LSM tree level is `>= blob_file_starting_level` - Add unit tests for the new functionality, and add the new option to our stress tests (`db_stress` and `db_crashtest.py` ) - Add the new option to our benchmarking tool `db_bench` and the BlobDB benchmark script `run_blob_bench.sh` - Add the new option to the `ldb` tool (see https://github.com/facebook/rocksdb/wiki/Administration-and-Data-Access-Tool) - Ideally extend the C and Java bindings with the new option - Update the BlobDB wiki to document the new option. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10077 Reviewed By: ltamasi Differential Revision: D36884156 Pulled By: gangliao fbshipit-source-id: 942bab025f04633edca8564ed64791cb5e31627d
3 years ago
void rocksdb_options_set_blob_file_starting_level(rocksdb_options_t* opt,
int val) {
opt->rep.blob_file_starting_level = val;
}
int rocksdb_options_get_blob_file_starting_level(rocksdb_options_t* opt) {
return opt->rep.blob_file_starting_level;
}
void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
opt->rep.num_levels = n;
}
int rocksdb_options_get_num_levels(rocksdb_options_t* opt) {
return opt->rep.num_levels;
}
void rocksdb_options_set_level0_file_num_compaction_trigger(
rocksdb_options_t* opt, int n) {
opt->rep.level0_file_num_compaction_trigger = n;
}
int rocksdb_options_get_level0_file_num_compaction_trigger(
rocksdb_options_t* opt) {
return opt->rep.level0_file_num_compaction_trigger;
}
void rocksdb_options_set_level0_slowdown_writes_trigger(
rocksdb_options_t* opt, int n) {
opt->rep.level0_slowdown_writes_trigger = n;
}
int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) {
return opt->rep.level0_slowdown_writes_trigger;
}
void rocksdb_options_set_level0_stop_writes_trigger(
rocksdb_options_t* opt, int n) {
opt->rep.level0_stop_writes_trigger = n;
}
int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) {
return opt->rep.level0_stop_writes_trigger;
}
void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt,int mode) {
opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode);
}
int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) {
return static_cast<int>(opt->rep.wal_recovery_mode);
}
void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
opt->rep.compression = static_cast<CompressionType>(t);
}
int rocksdb_options_get_compression(rocksdb_options_t* opt) {
return opt->rep.compression;
}
void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) {
opt->rep.bottommost_compression = static_cast<CompressionType>(t);
}
int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) {
return opt->rep.bottommost_compression;
}
void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
const int* level_values,
size_t num_levels) {
opt->rep.compression_per_level.resize(num_levels);
for (size_t i = 0; i < num_levels; ++i) {
opt->rep.compression_per_level[i] =
static_cast<CompressionType>(level_values[i]);
}
}
void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt,
int w_bits, int level,
int strategy,
int max_dict_bytes,
unsigned char enabled) {
opt->rep.bottommost_compression_opts.window_bits = w_bits;
opt->rep.bottommost_compression_opts.level = level;
opt->rep.bottommost_compression_opts.strategy = strategy;
opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes;
opt->rep.bottommost_compression_opts.enabled = enabled;
}
void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) {
opt->rep.bottommost_compression_opts.zstd_max_train_bytes =
zstd_max_train_bytes;
opt->rep.bottommost_compression_opts.enabled = enabled;
}
Support using ZDICT_finalizeDictionary to generate zstd dictionary (#9857) Summary: An untrained dictionary is currently simply the concatenation of several samples. The ZSTD API, ZDICT_finalizeDictionary(), can improve such a dictionary's effectiveness at low cost. This PR changes how dictionary is created by calling the ZSTD ZDICT_finalizeDictionary() API instead of creating raw content dictionary (when max_dict_buffer_bytes > 0), and pass in all buffered uncompressed data blocks as samples. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9857 Test Plan: #### db_bench test for cpu/memory of compression+decompression and space saving on synthetic data: Set up: change the parameter [here](https://github.com/facebook/rocksdb/blob/fb9a167a55e0970b1ef6f67c1600c8d9c4c6114f/tools/db_bench_tool.cc#L1766) to 16384 to make synthetic data more compressible. ``` # linked local ZSTD with version 1.5.2 # DEBUG_LEVEL=0 ROCKSDB_NO_FBCODE=1 ROCKSDB_DISABLE_ZSTD=1 EXTRA_CXXFLAGS="-DZSTD_STATIC_LINKING_ONLY -DZSTD -I/data/users/changyubi/install/include/" EXTRA_LDFLAGS="-L/data/users/changyubi/install/lib/ -l:libzstd.a" make -j32 db_bench dict_bytes=16384 train_bytes=1048576 echo "========== No Dictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total echo "========== Raw Content Dictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench_main -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total echo "========== FinalizeDictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total echo "========== TrainDictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total # Result: TrainDictionary is much better on space saving, but FinalizeDictionary seems to use less memory. # before compression data size: 1.2GB dict_bytes=16384 max_dict_buffer_bytes = 1048576 space cpu/memory No Dictionary 468M 14.93user 1.00system 0:15.92elapsed 100%CPU (0avgtext+0avgdata 23904maxresident)k Raw Dictionary 251M 15.81user 0.80system 0:16.56elapsed 100%CPU (0avgtext+0avgdata 156808maxresident)k FinalizeDictionary 236M 11.93user 0.64system 0:12.56elapsed 100%CPU (0avgtext+0avgdata 89548maxresident)k TrainDictionary 84M 7.29user 0.45system 0:07.75elapsed 100%CPU (0avgtext+0avgdata 97288maxresident)k ``` #### Benchmark on 10 sample SST files for spacing saving and CPU time on compression: FinalizeDictionary is comparable to TrainDictionary in terms of space saving, and takes less time in compression. ``` dict_bytes=16384 train_bytes=1048576 for sst_file in `ls ../temp/myrock-sst/` do echo "********** $sst_file **********" echo "========== No Dictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD echo "========== Raw Content Dictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes echo "========== FinalizeDictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes --compression_use_zstd_finalize_dict echo "========== TrainDictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes done 010240.sst (Size/Time) 011029.sst 013184.sst 021552.sst 185054.sst 185137.sst 191666.sst 7560381.sst 7604174.sst 7635312.sst No Dictionary 28165569 / 2614419 32899411 / 2976832 32977848 / 3055542 31966329 / 2004590 33614351 / 1755877 33429029 / 1717042 33611933 / 1776936 33634045 / 2771417 33789721 / 2205414 33592194 / 388254 Raw Content Dictionary 28019950 / 2697961 33748665 / 3572422 33896373 / 3534701 26418431 / 2259658 28560825 / 1839168 28455030 / 1846039 28494319 / 1861349 32391599 / 3095649 33772142 / 2407843 33592230 / 474523 FinalizeDictionary 27896012 / 2650029 33763886 / 3719427 33904283 / 3552793 26008225 / 2198033 28111872 / 1869530 28014374 / 1789771 28047706 / 1848300 32296254 / 3204027 33698698 / 2381468 33592344 / 517433 TrainDictionary 28046089 / 2740037 33706480 / 3679019 33885741 / 3629351 25087123 / 2204558 27194353 / 1970207 27234229 / 1896811 27166710 / 1903119 32011041 / 3322315 32730692 / 2406146 33608631 / 570593 ``` #### Decompression/Read test: With FinalizeDictionary/TrainDictionary, some data structure used for decompression are in stored in dictionary, so they are expected to be faster in terms of decompression/reads. ``` dict_bytes=16384 train_bytes=1048576 echo "No Dictionary" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=0 > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=0 2>&1 | grep MB/s echo "Raw Dictionary" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes 2>&1 | grep MB/s echo "FinalizeDict" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false 2>&1 | grep MB/s echo "Train Dictionary" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes 2>&1 | grep MB/s No Dictionary readrandom : 12.183 micros/op 82082 ops/sec 12.183 seconds 1000000 operations; 9.1 MB/s (1000000 of 1000000 found) Raw Dictionary readrandom : 12.314 micros/op 81205 ops/sec 12.314 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found) FinalizeDict readrandom : 9.787 micros/op 102180 ops/sec 9.787 seconds 1000000 operations; 11.3 MB/s (1000000 of 1000000 found) Train Dictionary readrandom : 9.698 micros/op 103108 ops/sec 9.699 seconds 1000000 operations; 11.4 MB/s (1000000 of 1000000 found) ``` Reviewed By: ajkr Differential Revision: D35720026 Pulled By: cbi42 fbshipit-source-id: 24d230fdff0fd28a1bb650658798f00dfcfb2a1f
3 years ago
void rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer,
unsigned char enabled) {
opt->rep.bottommost_compression_opts.use_zstd_dict_trainer =
use_zstd_dict_trainer;
opt->rep.bottommost_compression_opts.enabled = enabled;
}
unsigned char
rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt) {
return opt->rep.bottommost_compression_opts.use_zstd_dict_trainer;
}
Limit buffering for collecting samples for compression dictionary (#7970) Summary: For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file. However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage. Related changes include: - Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks - Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary - Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970 Test Plan: - updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level - looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set. Reviewed By: pdillinger Differential Revision: D26467994 Pulled By: ajkr fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
4 years ago
void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes,
unsigned char enabled) {
opt->rep.bottommost_compression_opts.max_dict_buffer_bytes =
max_dict_buffer_bytes;
opt->rep.bottommost_compression_opts.enabled = enabled;
}
void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits,
int level, int strategy,
int max_dict_bytes) {
opt->rep.compression_opts.window_bits = w_bits;
opt->rep.compression_opts.level = level;
opt->rep.compression_opts.strategy = strategy;
opt->rep.compression_opts.max_dict_bytes = max_dict_bytes;
}
void rocksdb_options_set_compression_options_zstd_max_train_bytes(
rocksdb_options_t* opt, int zstd_max_train_bytes) {
opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes;
}
int rocksdb_options_get_compression_options_zstd_max_train_bytes(
rocksdb_options_t* opt) {
return opt->rep.compression_opts.zstd_max_train_bytes;
}
Support using ZDICT_finalizeDictionary to generate zstd dictionary (#9857) Summary: An untrained dictionary is currently simply the concatenation of several samples. The ZSTD API, ZDICT_finalizeDictionary(), can improve such a dictionary's effectiveness at low cost. This PR changes how dictionary is created by calling the ZSTD ZDICT_finalizeDictionary() API instead of creating raw content dictionary (when max_dict_buffer_bytes > 0), and pass in all buffered uncompressed data blocks as samples. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9857 Test Plan: #### db_bench test for cpu/memory of compression+decompression and space saving on synthetic data: Set up: change the parameter [here](https://github.com/facebook/rocksdb/blob/fb9a167a55e0970b1ef6f67c1600c8d9c4c6114f/tools/db_bench_tool.cc#L1766) to 16384 to make synthetic data more compressible. ``` # linked local ZSTD with version 1.5.2 # DEBUG_LEVEL=0 ROCKSDB_NO_FBCODE=1 ROCKSDB_DISABLE_ZSTD=1 EXTRA_CXXFLAGS="-DZSTD_STATIC_LINKING_ONLY -DZSTD -I/data/users/changyubi/install/include/" EXTRA_LDFLAGS="-L/data/users/changyubi/install/lib/ -l:libzstd.a" make -j32 db_bench dict_bytes=16384 train_bytes=1048576 echo "========== No Dictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total echo "========== Raw Content Dictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench_main -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total echo "========== FinalizeDictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total echo "========== TrainDictionary ==========" TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1 TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 2>&1 | grep elapsed du -hc /dev/shm/dbbench/*sst | grep total # Result: TrainDictionary is much better on space saving, but FinalizeDictionary seems to use less memory. # before compression data size: 1.2GB dict_bytes=16384 max_dict_buffer_bytes = 1048576 space cpu/memory No Dictionary 468M 14.93user 1.00system 0:15.92elapsed 100%CPU (0avgtext+0avgdata 23904maxresident)k Raw Dictionary 251M 15.81user 0.80system 0:16.56elapsed 100%CPU (0avgtext+0avgdata 156808maxresident)k FinalizeDictionary 236M 11.93user 0.64system 0:12.56elapsed 100%CPU (0avgtext+0avgdata 89548maxresident)k TrainDictionary 84M 7.29user 0.45system 0:07.75elapsed 100%CPU (0avgtext+0avgdata 97288maxresident)k ``` #### Benchmark on 10 sample SST files for spacing saving and CPU time on compression: FinalizeDictionary is comparable to TrainDictionary in terms of space saving, and takes less time in compression. ``` dict_bytes=16384 train_bytes=1048576 for sst_file in `ls ../temp/myrock-sst/` do echo "********** $sst_file **********" echo "========== No Dictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD echo "========== Raw Content Dictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes echo "========== FinalizeDictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes --compression_use_zstd_finalize_dict echo "========== TrainDictionary ==========" ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes done 010240.sst (Size/Time) 011029.sst 013184.sst 021552.sst 185054.sst 185137.sst 191666.sst 7560381.sst 7604174.sst 7635312.sst No Dictionary 28165569 / 2614419 32899411 / 2976832 32977848 / 3055542 31966329 / 2004590 33614351 / 1755877 33429029 / 1717042 33611933 / 1776936 33634045 / 2771417 33789721 / 2205414 33592194 / 388254 Raw Content Dictionary 28019950 / 2697961 33748665 / 3572422 33896373 / 3534701 26418431 / 2259658 28560825 / 1839168 28455030 / 1846039 28494319 / 1861349 32391599 / 3095649 33772142 / 2407843 33592230 / 474523 FinalizeDictionary 27896012 / 2650029 33763886 / 3719427 33904283 / 3552793 26008225 / 2198033 28111872 / 1869530 28014374 / 1789771 28047706 / 1848300 32296254 / 3204027 33698698 / 2381468 33592344 / 517433 TrainDictionary 28046089 / 2740037 33706480 / 3679019 33885741 / 3629351 25087123 / 2204558 27194353 / 1970207 27234229 / 1896811 27166710 / 1903119 32011041 / 3322315 32730692 / 2406146 33608631 / 570593 ``` #### Decompression/Read test: With FinalizeDictionary/TrainDictionary, some data structure used for decompression are in stored in dictionary, so they are expected to be faster in terms of decompression/reads. ``` dict_bytes=16384 train_bytes=1048576 echo "No Dictionary" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=0 > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=0 2>&1 | grep MB/s echo "Raw Dictionary" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes 2>&1 | grep MB/s echo "FinalizeDict" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false 2>&1 | grep MB/s echo "Train Dictionary" TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes > /dev/null 2>&1 TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes 2>&1 | grep MB/s No Dictionary readrandom : 12.183 micros/op 82082 ops/sec 12.183 seconds 1000000 operations; 9.1 MB/s (1000000 of 1000000 found) Raw Dictionary readrandom : 12.314 micros/op 81205 ops/sec 12.314 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found) FinalizeDict readrandom : 9.787 micros/op 102180 ops/sec 9.787 seconds 1000000 operations; 11.3 MB/s (1000000 of 1000000 found) Train Dictionary readrandom : 9.698 micros/op 103108 ops/sec 9.699 seconds 1000000 operations; 11.4 MB/s (1000000 of 1000000 found) ``` Reviewed By: ajkr Differential Revision: D35720026 Pulled By: cbi42 fbshipit-source-id: 24d230fdff0fd28a1bb650658798f00dfcfb2a1f
3 years ago
void rocksdb_options_set_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer) {
opt->rep.compression_opts.use_zstd_dict_trainer = use_zstd_dict_trainer;
}
unsigned char rocksdb_options_get_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt) {
return opt->rep.compression_opts.use_zstd_dict_trainer;
}
void rocksdb_options_set_compression_options_parallel_threads(
rocksdb_options_t* opt, int value) {
opt->rep.compression_opts.parallel_threads = value;
}
int rocksdb_options_get_compression_options_parallel_threads(
rocksdb_options_t* opt) {
return opt->rep.compression_opts.parallel_threads;
}
Limit buffering for collecting samples for compression dictionary (#7970) Summary: For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file. However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage. Related changes include: - Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks - Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary - Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970 Test Plan: - updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level - looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set. Reviewed By: pdillinger Differential Revision: D26467994 Pulled By: ajkr fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
4 years ago
void rocksdb_options_set_compression_options_max_dict_buffer_bytes(
rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) {
opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes;
}
uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes(
rocksdb_options_t* opt) {
return opt->rep.compression_opts.max_dict_buffer_bytes;
}
void rocksdb_options_set_prefix_extractor(
rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
opt->rep.prefix_extractor.reset(prefix_extractor);
}
void rocksdb_options_set_use_fsync(
rocksdb_options_t* opt, int use_fsync) {
opt->rep.use_fsync = use_fsync;
}
int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) {
return opt->rep.use_fsync;
}
void rocksdb_options_set_db_log_dir(
rocksdb_options_t* opt, const char* db_log_dir) {
opt->rep.db_log_dir = db_log_dir;
}
void rocksdb_options_set_wal_dir(
rocksdb_options_t* opt, const char* v) {
opt->rep.wal_dir = v;
}
void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
opt->rep.WAL_ttl_seconds = ttl;
}
uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) {
return opt->rep.WAL_ttl_seconds;
}
void rocksdb_options_set_WAL_size_limit_MB(
rocksdb_options_t* opt, uint64_t limit) {
opt->rep.WAL_size_limit_MB = limit;
}
uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) {
return opt->rep.WAL_size_limit_MB;
}
void rocksdb_options_set_manifest_preallocation_size(
rocksdb_options_t* opt, size_t v) {
opt->rep.manifest_preallocation_size = v;
}
size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) {
return opt->rep.manifest_preallocation_size;
}
void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
unsigned char v) {
opt->rep.use_direct_reads = v;
}
unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) {
return opt->rep.use_direct_reads;
}
void rocksdb_options_set_use_direct_io_for_flush_and_compaction(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.use_direct_io_for_flush_and_compaction = v;
}
unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction(
rocksdb_options_t* opt) {
return opt->rep.use_direct_io_for_flush_and_compaction;
}
void rocksdb_options_set_allow_mmap_reads(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.allow_mmap_reads = v;
}
unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) {
return opt->rep.allow_mmap_reads;
}
void rocksdb_options_set_allow_mmap_writes(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.allow_mmap_writes = v;
}
unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) {
return opt->rep.allow_mmap_writes;
}
void rocksdb_options_set_is_fd_close_on_exec(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.is_fd_close_on_exec = v;
}
unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) {
return opt->rep.is_fd_close_on_exec;
}
void rocksdb_options_set_stats_dump_period_sec(
rocksdb_options_t* opt, unsigned int v) {
opt->rep.stats_dump_period_sec = v;
}
unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) {
return opt->rep.stats_dump_period_sec;
}
void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt,
unsigned int v) {
opt->rep.stats_persist_period_sec = v;
}
unsigned int rocksdb_options_get_stats_persist_period_sec(
rocksdb_options_t* opt) {
return opt->rep.stats_persist_period_sec;
}
void rocksdb_options_set_advise_random_on_open(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.advise_random_on_open = v;
}
unsigned char rocksdb_options_get_advise_random_on_open(
rocksdb_options_t* opt) {
return opt->rep.advise_random_on_open;
}
Memtable sampling for mempurge heuristic. (#8628) Summary: Changes the API of the MemPurge process: the `bool experimental_allow_mempurge` and `experimental_mempurge_policy` flags have been replaced by a `double experimental_mempurge_threshold` option. This change of API reflects another major change introduced in this PR: the MemPurgeDecider() function now works by sampling the memtables being flushed to estimate the overall amount of useful payload (payload minus the garbage), and then compare this useful payload estimate with the `double experimental_mempurge_threshold` value. Therefore, when the value of this flag is `0.0` (default value), mempurge is simply deactivated. On the other hand, a value of `DBL_MAX` would be equivalent to always going through a mempurge regardless of the garbage ratio estimate. At the moment, a `double experimental_mempurge_threshold` value else than 0.0 or `DBL_MAX` is opnly supported`with the `SkipList` memtable representation. Regarding the sampling, this PR includes the introduction of a `MemTable::UniqueRandomSample` function that collects (approximately) random entries from the memtable by using the new `SkipList::Iterator::RandomSeek()` under the hood, or by iterating through each memtable entry, depending on the target sample size and the total number of entries. The unit tests have been readapted to support this new API. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8628 Reviewed By: pdillinger Differential Revision: D30149315 Pulled By: bjlemaire fbshipit-source-id: 1feef5390c95db6f4480ab4434716533d3947f27
3 years ago
void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt,
double v) {
opt->rep.experimental_mempurge_threshold = v;
Memtable "MemPurge" prototype (#8454) Summary: Implement an experimental feature called "MemPurge", which consists in purging "garbage" bytes out of a memtable and reuse the memtable struct instead of making it immutable and eventually flushing its content to storage. The prototype is by default deactivated and is not intended for use. It is intended for correctness and validation testing. At the moment, the "MemPurge" feature can be switched on by using the `options.experimental_allow_mempurge` flag. For this early stage, when the allow_mempurge flag is set to `true`, all the flush operations will be rerouted to perform a MemPurge. This is a temporary design decision that will give us the time to explore meaningful heuristics to use MemPurge at the right time for relevant workloads . Moreover, the current MemPurge operation only supports `Puts`, `Deletes`, `DeleteRange` operations, and handles `Iterators` as well as `CompactionFilter`s that are invoked at flush time . Three unit tests are added to `db_flush_test.cc` to test if MemPurge works correctly (and checks that the previously mentioned operations are fully supported thoroughly tested). One noticeable design decision is the timing of the MemPurge operation in the memtable workflow: for this prototype, the mempurge happens when the memtable is switched (and usually made immutable). This is an inefficient process because it implies that the entirety of the MemPurge operation happens while holding the db_mutex. Future commits will make the MemPurge operation a background task (akin to the regular flush operation) and aim at drastically enhancing the performance of this operation. The MemPurge is also not fully "WAL-compatible" yet, but when the WAL is full, or when the regular MemPurge operation fails (or when the purged memtable still needs to be flushed), a regular flush operation takes place. Later commits will also correct these behaviors. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8454 Reviewed By: anand1976 Differential Revision: D29433971 Pulled By: bjlemaire fbshipit-source-id: 6af48213554e35048a7e03816955100a80a26dc5
4 years ago
}
void rocksdb_options_set_access_hint_on_compaction_start(
rocksdb_options_t* opt, int v) {
switch(v) {
case 0:
opt->rep.access_hint_on_compaction_start =
ROCKSDB_NAMESPACE::Options::NONE;
break;
case 1:
opt->rep.access_hint_on_compaction_start =
ROCKSDB_NAMESPACE::Options::NORMAL;
break;
case 2:
opt->rep.access_hint_on_compaction_start =
ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
break;
case 3:
opt->rep.access_hint_on_compaction_start =
ROCKSDB_NAMESPACE::Options::WILLNEED;
break;
default:
assert(0);
}
}
int rocksdb_options_get_access_hint_on_compaction_start(
rocksdb_options_t* opt) {
return opt->rep.access_hint_on_compaction_start;
}
void rocksdb_options_set_use_adaptive_mutex(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.use_adaptive_mutex = v;
}
unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) {
return opt->rep.use_adaptive_mutex;
}
void rocksdb_options_set_wal_bytes_per_sync(
rocksdb_options_t* opt, uint64_t v) {
opt->rep.wal_bytes_per_sync = v;
}
uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) {
return opt->rep.wal_bytes_per_sync;
}
void rocksdb_options_set_bytes_per_sync(
rocksdb_options_t* opt, uint64_t v) {
opt->rep.bytes_per_sync = v;
}
uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) {
return opt->rep.bytes_per_sync;
}
void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
uint64_t v) {
opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v);
}
uint64_t rocksdb_options_get_writable_file_max_buffer_size(
rocksdb_options_t* opt) {
return opt->rep.writable_file_max_buffer_size;
}
void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
unsigned char v) {
opt->rep.allow_concurrent_memtable_write = v;
}
unsigned char rocksdb_options_get_allow_concurrent_memtable_write(
rocksdb_options_t* opt) {
return opt->rep.allow_concurrent_memtable_write;
}
void rocksdb_options_set_enable_write_thread_adaptive_yield(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.enable_write_thread_adaptive_yield = v;
}
unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield(
rocksdb_options_t* opt) {
return opt->rep.enable_write_thread_adaptive_yield;
}
void rocksdb_options_set_max_sequential_skip_in_iterations(
rocksdb_options_t* opt, uint64_t v) {
opt->rep.max_sequential_skip_in_iterations = v;
}
uint64_t rocksdb_options_get_max_sequential_skip_in_iterations(
rocksdb_options_t* opt) {
return opt->rep.max_sequential_skip_in_iterations;
}
void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
opt->rep.max_write_buffer_number = n;
}
int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) {
return opt->rep.max_write_buffer_number;
}
void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
opt->rep.min_write_buffer_number_to_merge = n;
}
int rocksdb_options_get_min_write_buffer_number_to_merge(
rocksdb_options_t* opt) {
return opt->rep.min_write_buffer_number_to_merge;
}
void rocksdb_options_set_max_write_buffer_number_to_maintain(
rocksdb_options_t* opt, int n) {
opt->rep.max_write_buffer_number_to_maintain = n;
}
int rocksdb_options_get_max_write_buffer_number_to_maintain(
rocksdb_options_t* opt) {
return opt->rep.max_write_buffer_number_to_maintain;
}
Refactor trimming logic for immutable memtables (#5022) Summary: MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory. We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one. The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming. In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022 Differential Revision: D14394062 Pulled By: miasantreble fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
void rocksdb_options_set_max_write_buffer_size_to_maintain(
rocksdb_options_t* opt, int64_t n) {
opt->rep.max_write_buffer_size_to_maintain = n;
}
int64_t rocksdb_options_get_max_write_buffer_size_to_maintain(
rocksdb_options_t* opt) {
return opt->rep.max_write_buffer_size_to_maintain;
}
void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt,
unsigned char v) {
opt->rep.enable_pipelined_write = v;
}
unsigned char rocksdb_options_get_enable_pipelined_write(
rocksdb_options_t* opt) {
return opt->rep.enable_pipelined_write;
}
Unordered Writes (#5218) Summary: Performing unordered writes in rocksdb when unordered_write option is set to true. When enabled the writes to memtable are done without joining any write thread. This offers much higher write throughput since the upcoming writes would not have to wait for the slowest memtable write to finish. The tradeoff is that the writes visible to a snapshot might change over time. If the application cannot tolerate that, it should implement its own mechanisms to work around that. Using TransactionDB with WRITE_PREPARED write policy is one way to achieve that. Doing so increases the max throughput by 2.2x without however compromising the snapshot guarantees. The patch is prepared based on an original by siying Existing unit tests are extended to include unordered_write option. Benchmark Results: ``` TEST_TMPDIR=/dev/shm/ ./db_bench_unordered --benchmarks=fillrandom --threads=32 --num=10000000 -max_write_buffer_number=16 --max_background_jobs=64 --batch_size=8 --writes=3000000 -level0_file_num_compaction_trigger=99999 --level0_slowdown_writes_trigger=99999 --level0_stop_writes_trigger=99999 -enable_pipelined_write=false -disable_auto_compactions --unordered_write=1 ``` With WAL - Vanilla RocksDB: 78.6 MB/s - WRITER_PREPARED with unordered_write: 177.8 MB/s (2.2x) - unordered_write: 368.9 MB/s (4.7x with relaxed snapshot guarantees) Without WAL - Vanilla RocksDB: 111.3 MB/s - WRITER_PREPARED with unordered_write: 259.3 MB/s MB/s (2.3x) - unordered_write: 645.6 MB/s (5.8x with relaxed snapshot guarantees) - WRITER_PREPARED with unordered_write disable concurrency control: 185.3 MB/s MB/s (2.35x) Limitations: - The feature is not yet extended to `max_successive_merges` > 0. The feature is also incompatible with `enable_pipelined_write` = true as well as with `allow_concurrent_memtable_write` = false. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5218 Differential Revision: D15219029 Pulled By: maysamyabandeh fbshipit-source-id: 38f2abc4af8780148c6128acdba2b3227bc81759
6 years ago
void rocksdb_options_set_unordered_write(rocksdb_options_t* opt,
unsigned char v) {
opt->rep.unordered_write = v;
}
unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) {
return opt->rep.unordered_write;
}
void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt,
uint32_t n) {
opt->rep.max_subcompactions = n;
}
uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) {
return opt->rep.max_subcompactions;
}
void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) {
opt->rep.max_background_jobs = n;
}
int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) {
return opt->rep.max_background_jobs;
}
void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
opt->rep.max_background_compactions = n;
}
int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) {
return opt->rep.max_background_compactions;
}
void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
opt->rep.max_background_flushes = n;
}
int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) {
return opt->rep.max_background_flushes;
}
void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
opt->rep.max_log_file_size = v;
}
size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) {
return opt->rep.max_log_file_size;
}
void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) {
opt->rep.log_file_time_to_roll = v;
}
size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) {
return opt->rep.log_file_time_to_roll;
}
void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
opt->rep.keep_log_file_num = v;
}
size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) {
return opt->rep.keep_log_file_num;
}
void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
size_t v) {
opt->rep.recycle_log_file_num = v;
}
size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) {
return opt->rep.recycle_log_file_num;
}
void rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) {
opt->rep.soft_pending_compaction_bytes_limit = v;
}
size_t rocksdb_options_get_soft_pending_compaction_bytes_limit(
rocksdb_options_t* opt) {
return opt->rep.soft_pending_compaction_bytes_limit;
}
void rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) {
opt->rep.hard_pending_compaction_bytes_limit = v;
}
size_t rocksdb_options_get_hard_pending_compaction_bytes_limit(
rocksdb_options_t* opt) {
return opt->rep.hard_pending_compaction_bytes_limit;
}
void rocksdb_options_set_max_manifest_file_size(
rocksdb_options_t* opt, size_t v) {
opt->rep.max_manifest_file_size = v;
}
size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) {
return opt->rep.max_manifest_file_size;
}
void rocksdb_options_set_table_cache_numshardbits(
rocksdb_options_t* opt, int v) {
opt->rep.table_cache_numshardbits = v;
}
int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) {
return opt->rep.table_cache_numshardbits;
}
void rocksdb_options_set_arena_block_size(
rocksdb_options_t* opt, size_t v) {
opt->rep.arena_block_size = v;
}
size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) {
return opt->rep.arena_block_size;
}
void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
opt->rep.disable_auto_compactions = disable;
}
unsigned char rocksdb_options_get_disable_auto_compactions(
rocksdb_options_t* opt) {
return opt->rep.disable_auto_compactions;
}
void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, int v) {
opt->rep.optimize_filters_for_hits = v;
}
unsigned char rocksdb_options_get_optimize_filters_for_hits(
rocksdb_options_t* opt) {
return opt->rep.optimize_filters_for_hits;
}
void rocksdb_options_set_delete_obsolete_files_period_micros(
rocksdb_options_t* opt, uint64_t v) {
opt->rep.delete_obsolete_files_period_micros = v;
}
uint64_t rocksdb_options_get_delete_obsolete_files_period_micros(
rocksdb_options_t* opt) {
return opt->rep.delete_obsolete_files_period_micros;
}
void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
opt->rep.PrepareForBulkLoad();
}
void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
}
void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
rocksdb_options_t* opt, double v) {
opt->rep.memtable_prefix_bloom_size_ratio = v;
}
double rocksdb_options_get_memtable_prefix_bloom_size_ratio(
rocksdb_options_t* opt) {
return opt->rep.memtable_prefix_bloom_size_ratio;
}
void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt,
size_t v) {
opt->rep.memtable_huge_page_size = v;
}
size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) {
return opt->rep.memtable_huge_page_size;
}
void rocksdb_options_set_hash_skip_list_rep(
rocksdb_options_t *opt, size_t bucket_count,
int32_t skiplist_height, int32_t skiplist_branching_factor) {
ROCKSDB_NAMESPACE::MemTableRepFactory* factory =
ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
bucket_count, skiplist_height, skiplist_branching_factor);
opt->rep.memtable_factory.reset(factory);
}
void rocksdb_options_set_hash_link_list_rep(
rocksdb_options_t *opt, size_t bucket_count) {
opt->rep.memtable_factory.reset(
ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
}
void rocksdb_options_set_plain_table_factory(
rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness) {
ROCKSDB_NAMESPACE::PlainTableOptions options;
options.user_key_len = user_key_len;
options.bloom_bits_per_key = bloom_bits_per_key;
options.hash_table_ratio = hash_table_ratio;
options.index_sparseness = index_sparseness;
ROCKSDB_NAMESPACE::TableFactory* factory =
ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
opt->rep.table_factory.reset(factory);
}
void rocksdb_options_set_max_successive_merges(
rocksdb_options_t* opt, size_t v) {
opt->rep.max_successive_merges = v;
}
size_t rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) {
return opt->rep.max_successive_merges;
}
void rocksdb_options_set_bloom_locality(
rocksdb_options_t* opt, uint32_t v) {
opt->rep.bloom_locality = v;
}
uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) {
return opt->rep.bloom_locality;
}
void rocksdb_options_set_inplace_update_support(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.inplace_update_support = v;
}
unsigned char rocksdb_options_get_inplace_update_support(
rocksdb_options_t* opt) {
return opt->rep.inplace_update_support;
}
void rocksdb_options_set_inplace_update_num_locks(
rocksdb_options_t* opt, size_t v) {
opt->rep.inplace_update_num_locks = v;
}
size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) {
return opt->rep.inplace_update_num_locks;
}
void rocksdb_options_set_report_bg_io_stats(
rocksdb_options_t* opt, int v) {
opt->rep.report_bg_io_stats = v;
}
unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) {
return opt->rep.report_bg_io_stats;
}
void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
opt->rep.compaction_style =
static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style);
}
int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) {
return opt->rep.compaction_style;
}
void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) {
opt->rep.compaction_options_universal = *(uco->rep);
}
void rocksdb_options_set_fifo_compaction_options(
rocksdb_options_t* opt,
rocksdb_fifo_compaction_options_t* fifo) {
opt->rep.compaction_options_fifo = fifo->rep;
}
char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) {
ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
if (statistics) {
return strdup(statistics->ToString().c_str());
}
return nullptr;
}
void rocksdb_options_set_ratelimiter(rocksdb_options_t *opt, rocksdb_ratelimiter_t *limiter) {
if (limiter) {
opt->rep.rate_limiter = limiter->rep;
}
}
void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt,
unsigned char atomic_flush) {
opt->rep.atomic_flush = atomic_flush;
}
unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) {
return opt->rep.atomic_flush;
}
void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt,
unsigned char manual_wal_flush) {
opt->rep.manual_wal_flush = manual_wal_flush;
}
unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) {
return opt->rep.manual_wal_flush;
}
void rocksdb_options_set_wal_compression(rocksdb_options_t* opt, int val) {
opt->rep.wal_compression = static_cast<CompressionType>(val);
}
int rocksdb_options_get_wal_compression(rocksdb_options_t* opt) {
return opt->rep.wal_compression;
}
rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
int64_t rate_bytes_per_sec,
int64_t refill_period_us,
int32_t fairness) {
rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t;
rate_limiter->rep.reset(
NewGenericRateLimiter(rate_bytes_per_sec,
refill_period_us, fairness));
return rate_limiter;
}
void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) {
delete limiter;
}
void rocksdb_options_set_row_cache(rocksdb_options_t* opt, rocksdb_cache_t* cache) {
if(cache) {
opt->rep.row_cache = cache->rep;
}
}
void rocksdb_options_add_compact_on_deletion_collector_factory(
rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) {
std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
compact_on_del =
NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger);
opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
}
void rocksdb_set_perf_level(int v) {
PerfLevel level = static_cast<PerfLevel>(v);
SetPerfLevel(level);
}
rocksdb_perfcontext_t* rocksdb_perfcontext_create() {
rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t;
context->rep = ROCKSDB_NAMESPACE::get_perf_context();
return context;
}
void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) {
context->rep->Reset();
}
char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context,
unsigned char exclude_zero_counters) {
return strdup(context->rep->ToString(exclude_zero_counters).c_str());
}
uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
int metric) {
PerfContext* rep = context->rep;
switch (metric) {
case rocksdb_user_key_comparison_count:
return rep->user_key_comparison_count;
case rocksdb_block_cache_hit_count:
return rep->block_cache_hit_count;
case rocksdb_block_read_count:
return rep->block_read_count;
case rocksdb_block_read_byte:
return rep->block_read_byte;
case rocksdb_block_read_time:
return rep->block_read_time;
case rocksdb_block_checksum_time:
return rep->block_checksum_time;
case rocksdb_block_decompress_time:
return rep->block_decompress_time;
case rocksdb_get_read_bytes:
return rep->get_read_bytes;
case rocksdb_multiget_read_bytes:
return rep->multiget_read_bytes;
case rocksdb_iter_read_bytes:
return rep->iter_read_bytes;
case rocksdb_internal_key_skipped_count:
return rep->internal_key_skipped_count;
case rocksdb_internal_delete_skipped_count:
return rep->internal_delete_skipped_count;
case rocksdb_internal_recent_skipped_count:
return rep->internal_recent_skipped_count;
case rocksdb_internal_merge_count:
return rep->internal_merge_count;
case rocksdb_get_snapshot_time:
return rep->get_snapshot_time;
case rocksdb_get_from_memtable_time:
return rep->get_from_memtable_time;
case rocksdb_get_from_memtable_count:
return rep->get_from_memtable_count;
case rocksdb_get_post_process_time:
return rep->get_post_process_time;
case rocksdb_get_from_output_files_time:
return rep->get_from_output_files_time;
case rocksdb_seek_on_memtable_time:
return rep->seek_on_memtable_time;
case rocksdb_seek_on_memtable_count:
return rep->seek_on_memtable_count;
case rocksdb_next_on_memtable_count:
return rep->next_on_memtable_count;
case rocksdb_prev_on_memtable_count:
return rep->prev_on_memtable_count;
case rocksdb_seek_child_seek_time:
return rep->seek_child_seek_time;
case rocksdb_seek_child_seek_count:
return rep->seek_child_seek_count;
case rocksdb_seek_min_heap_time:
return rep->seek_min_heap_time;
case rocksdb_seek_max_heap_time:
return rep->seek_max_heap_time;
case rocksdb_seek_internal_seek_time:
return rep->seek_internal_seek_time;
case rocksdb_find_next_user_entry_time:
return rep->find_next_user_entry_time;
case rocksdb_write_wal_time:
return rep->write_wal_time;
case rocksdb_write_memtable_time:
return rep->write_memtable_time;
case rocksdb_write_delay_time:
return rep->write_delay_time;
case rocksdb_write_pre_and_post_process_time:
return rep->write_pre_and_post_process_time;
case rocksdb_db_mutex_lock_nanos:
return rep->db_mutex_lock_nanos;
case rocksdb_db_condition_wait_nanos:
return rep->db_condition_wait_nanos;
case rocksdb_merge_operator_time_nanos:
return rep->merge_operator_time_nanos;
case rocksdb_read_index_block_nanos:
return rep->read_index_block_nanos;
case rocksdb_read_filter_block_nanos:
return rep->read_filter_block_nanos;
case rocksdb_new_table_block_iter_nanos:
return rep->new_table_block_iter_nanos;
case rocksdb_new_table_iterator_nanos:
return rep->new_table_iterator_nanos;
case rocksdb_block_seek_nanos:
return rep->block_seek_nanos;
case rocksdb_find_table_nanos:
return rep->find_table_nanos;
case rocksdb_bloom_memtable_hit_count:
return rep->bloom_memtable_hit_count;
case rocksdb_bloom_memtable_miss_count:
return rep->bloom_memtable_miss_count;
case rocksdb_bloom_sst_hit_count:
return rep->bloom_sst_hit_count;
case rocksdb_bloom_sst_miss_count:
return rep->bloom_sst_miss_count;
case rocksdb_key_lock_wait_time:
return rep->key_lock_wait_time;
case rocksdb_key_lock_wait_count:
return rep->key_lock_wait_count;
case rocksdb_env_new_sequential_file_nanos:
return rep->env_new_sequential_file_nanos;
case rocksdb_env_new_random_access_file_nanos:
return rep->env_new_random_access_file_nanos;
case rocksdb_env_new_writable_file_nanos:
return rep->env_new_writable_file_nanos;
case rocksdb_env_reuse_writable_file_nanos:
return rep->env_reuse_writable_file_nanos;
case rocksdb_env_new_random_rw_file_nanos:
return rep->env_new_random_rw_file_nanos;
case rocksdb_env_new_directory_nanos:
return rep->env_new_directory_nanos;
case rocksdb_env_file_exists_nanos:
return rep->env_file_exists_nanos;
case rocksdb_env_get_children_nanos:
return rep->env_get_children_nanos;
case rocksdb_env_get_children_file_attributes_nanos:
return rep->env_get_children_file_attributes_nanos;
case rocksdb_env_delete_file_nanos:
return rep->env_delete_file_nanos;
case rocksdb_env_create_dir_nanos:
return rep->env_create_dir_nanos;
case rocksdb_env_create_dir_if_missing_nanos:
return rep->env_create_dir_if_missing_nanos;
case rocksdb_env_delete_dir_nanos:
return rep->env_delete_dir_nanos;
case rocksdb_env_get_file_size_nanos:
return rep->env_get_file_size_nanos;
case rocksdb_env_get_file_modification_time_nanos:
return rep->env_get_file_modification_time_nanos;
case rocksdb_env_rename_file_nanos:
return rep->env_rename_file_nanos;
case rocksdb_env_link_file_nanos:
return rep->env_link_file_nanos;
case rocksdb_env_lock_file_nanos:
return rep->env_lock_file_nanos;
case rocksdb_env_unlock_file_nanos:
return rep->env_unlock_file_nanos;
case rocksdb_env_new_logger_nanos:
return rep->env_new_logger_nanos;
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
3 years ago
case rocksdb_number_async_seek:
return rep->number_async_seek;
default:
break;
}
return 0;
}
void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) {
delete context;
}
/*
TODO:
DB::OpenForReadOnly
DB::KeyMayExist
DB::GetOptions
DB::GetSortedWalFiles
DB::GetLatestSequenceNumber
DB::GetUpdatesSince
DB::GetDbIdentity
DB::RunManualCompaction
custom cache
table_properties_collectors
*/
rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
void* state,
void (*destructor)(void*),
unsigned char (*filter)(
void*,
int level,
const char* key, size_t key_length,
const char* existing_value, size_t value_length,
char** new_value, size_t *new_value_length,
unsigned char* value_changed),
const char* (*name)(void*)) {
rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t;
result->state_ = state;
result->destructor_ = destructor;
result->filter_ = filter;
result->ignore_snapshots_ = true;
result->name_ = name;
return result;
}
void rocksdb_compactionfilter_set_ignore_snapshots(
rocksdb_compactionfilter_t* filter,
unsigned char whether_ignore) {
filter->ignore_snapshots_ = whether_ignore;
}
void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) {
delete filter;
}
unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
rocksdb_compactionfiltercontext_t* context) {
return context->rep.is_full_compaction;
}
unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
rocksdb_compactionfiltercontext_t* context) {
return context->rep.is_manual_compaction;
}
rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create(
void* state, void (*destructor)(void*),
rocksdb_compactionfilter_t* (*create_compaction_filter)(
void*, rocksdb_compactionfiltercontext_t* context),
const char* (*name)(void*)) {
rocksdb_compactionfilterfactory_t* result =
new rocksdb_compactionfilterfactory_t;
result->state_ = state;
result->destructor_ = destructor;
result->create_compaction_filter_ = create_compaction_filter;
result->name_ = name;
return result;
}
void rocksdb_compactionfilterfactory_destroy(
rocksdb_compactionfilterfactory_t* factory) {
delete factory;
}
rocksdb_comparator_t* rocksdb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*)) {
rocksdb_comparator_t* result = new rocksdb_comparator_t;
result->state_ = state;
result->destructor_ = destructor;
result->compare_ = compare;
result->name_ = name;
result->compare_ts_ = nullptr;
result->compare_without_ts_ = nullptr;
return result;
}
void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; }
rocksdb_comparator_t* rocksdb_comparator_with_ts_create(
void* state, void (*destructor)(void*),
int (*compare)(void*, const char* a, size_t alen, const char* b,
size_t blen),
int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
size_t b_tslen),
int (*compare_without_ts)(void*, const char* a, size_t alen,
unsigned char a_has_ts, const char* b,
size_t blen, unsigned char b_has_ts),
const char* (*name)(void*), size_t timestamp_size) {
rocksdb_comparator_t* result = new rocksdb_comparator_t(timestamp_size);
result->state_ = state;
result->destructor_ = destructor;
result->compare_ = compare;
result->compare_ts_ = compare_ts;
result->compare_without_ts_ = compare_without_ts;
result->name_ = name;
return result;
}
void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
delete filter;
}
Add Bloom/Ribbon hybrid API support (#8679) Summary: This is essentially resurrection and fixing of the part of https://github.com/facebook/rocksdb/issues/8198 that was reverted in https://github.com/facebook/rocksdb/issues/8212, using data added in https://github.com/facebook/rocksdb/issues/8246. Basically, when configuring Ribbon filter, you can specify an LSM level before which Bloom will be used instead of Ribbon. But Bloom is only considered for Leveled and Universal compaction styles and file going into a known LSM level. This way, SST file writer, FIFO compaction, etc. use Ribbon filter as you would expect with NewRibbonFilterPolicy. So that this can be controlled with a single int value and so that flushes can be distinguished from intra-L0, we consider flush to go to level -1 for the purposes of this option. (Explained in API comment.) I also expect the most common and recommended Ribbon configuration to use Bloom during flush, to minimize slowing down writes and because according to my estimates, Ribbon only pays off if the structure lives in memory for more than an hour. Thus, I have changed the default for NewRibbonFilterPolicy to be this mild hybrid configuration. I don't really want to add something like NewHybridFilterPolicy because at least the mild hybrid configuration (Bloom for flush, Ribbon otherwise) should be considered a natural choice. C APIs also updated, but because they don't support overloading, rocksdb_filterpolicy_create_ribbon is kept pure ribbon for clarity and rocksdb_filterpolicy_create_ribbon_hybrid must be called for a hybrid configuration. While touching C API, I changed bits per key options from int to double. BuiltinFilterPolicy is needed so that LevelThresholdFilterPolicy doesn't inherit unused fields from BloomFilterPolicy. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8679 Test Plan: new + updated tests, including crash test Reviewed By: jay-zhuang Differential Revision: D30445797 Pulled By: pdillinger fbshipit-source-id: 6f5aeddfd6d79f7e55493b563c2d1d2d568892e1
3 years ago
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(
double bits_per_key, bool original_format) {
// Make a rocksdb_filterpolicy_t, but override all of its methods so
// they delegate to a NewBloomFilterPolicy() instead of user
// supplied C functions.
struct Wrapper : public rocksdb_filterpolicy_t {
const FilterPolicy* rep_;
~Wrapper() override { delete rep_; }
const char* Name() const override { return rep_->Name(); }
Fix a major performance bug in 7.0 re: filter compatibility (#9736) Summary: Bloom filters generated by pre-7.0 releases are not read by 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in https://github.com/facebook/rocksdb/issues/9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness. To fix, we go back using the old, unified name in SST metadata but (for a while anyway) recognize the aliases that could be generated by early 7.0.x releases. This unfortunately requires a public API change to avoid interfering with all the good changes from https://github.com/facebook/rocksdb/issues/9590, but the API change only affects users with custom FilterPolicy, which should be very few. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9736 Test Plan: manual Generate DBs with ``` ./db_bench.7.0 -db=/dev/shm/rocksdb.7.0 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 ``` and similar. Compare with ``` for IMPL in 6.29 7.0 fixed; do for DB in 6.29 7.0 fixed; do echo "Testing $IMPL on $DB:"; ./db_bench.$IMPL -db=/dev/shm/rocksdb.$DB -use_existing_db -readonly -bloom_bits=10 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=10 2>&1 | grep micros/op; done; done ``` Results: ``` Testing 6.29 on 6.29: readrandom : 34.381 micros/op 29085 ops/sec; 3.2 MB/s (291999 of 291999 found) Testing 6.29 on 7.0: readrandom : 190.443 micros/op 5249 ops/sec; 0.6 MB/s (52999 of 52999 found) Testing 6.29 on fixed: readrandom : 40.148 micros/op 24907 ops/sec; 2.8 MB/s (249999 of 249999 found) Testing 7.0 on 6.29: readrandom : 229.430 micros/op 4357 ops/sec; 0.5 MB/s (43999 of 43999 found) Testing 7.0 on 7.0: readrandom : 33.348 micros/op 29986 ops/sec; 3.3 MB/s (299999 of 299999 found) Testing 7.0 on fixed: readrandom : 152.734 micros/op 6546 ops/sec; 0.7 MB/s (65999 of 65999 found) Testing fixed on 6.29: readrandom : 32.024 micros/op 31224 ops/sec; 3.5 MB/s (312999 of 312999 found) Testing fixed on 7.0: readrandom : 33.990 micros/op 29390 ops/sec; 3.3 MB/s (294999 of 294999 found) Testing fixed on fixed: readrandom : 28.714 micros/op 34825 ops/sec; 3.9 MB/s (348999 of 348999 found) ``` Just paying attention to order of magnitude of ops/sec (short test durations, lots of noise), it's clear that with the fix we can read <= 6.29 & >= 7.0 at full speed, where neither 6.29 nor 7.0 can on both. And 6.29 release can properly read fixed DB at full speed. Reviewed By: siying, ajkr Differential Revision: D35057844 Pulled By: pdillinger fbshipit-source-id: a46893a6af4bf084375ebe4728066d00eb08f050
3 years ago
const char* CompatibilityName() const override {
return rep_->CompatibilityName();
}
// No need to override GetFilterBitsBuilder if this one is overridden
ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
const override {
return rep_->GetBuilderWithContext(context);
}
ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
const Slice& contents) const override {
return rep_->GetFilterBitsReader(contents);
}
static void DoNothing(void*) {}
};
Wrapper* wrapper = new Wrapper;
wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format);
wrapper->state_ = nullptr;
wrapper->destructor_ = &Wrapper::DoNothing;
return wrapper;
}
Add Bloom/Ribbon hybrid API support (#8679) Summary: This is essentially resurrection and fixing of the part of https://github.com/facebook/rocksdb/issues/8198 that was reverted in https://github.com/facebook/rocksdb/issues/8212, using data added in https://github.com/facebook/rocksdb/issues/8246. Basically, when configuring Ribbon filter, you can specify an LSM level before which Bloom will be used instead of Ribbon. But Bloom is only considered for Leveled and Universal compaction styles and file going into a known LSM level. This way, SST file writer, FIFO compaction, etc. use Ribbon filter as you would expect with NewRibbonFilterPolicy. So that this can be controlled with a single int value and so that flushes can be distinguished from intra-L0, we consider flush to go to level -1 for the purposes of this option. (Explained in API comment.) I also expect the most common and recommended Ribbon configuration to use Bloom during flush, to minimize slowing down writes and because according to my estimates, Ribbon only pays off if the structure lives in memory for more than an hour. Thus, I have changed the default for NewRibbonFilterPolicy to be this mild hybrid configuration. I don't really want to add something like NewHybridFilterPolicy because at least the mild hybrid configuration (Bloom for flush, Ribbon otherwise) should be considered a natural choice. C APIs also updated, but because they don't support overloading, rocksdb_filterpolicy_create_ribbon is kept pure ribbon for clarity and rocksdb_filterpolicy_create_ribbon_hybrid must be called for a hybrid configuration. While touching C API, I changed bits per key options from int to double. BuiltinFilterPolicy is needed so that LevelThresholdFilterPolicy doesn't inherit unused fields from BloomFilterPolicy. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8679 Test Plan: new + updated tests, including crash test Reviewed By: jay-zhuang Differential Revision: D30445797 Pulled By: pdillinger fbshipit-source-id: 6f5aeddfd6d79f7e55493b563c2d1d2d568892e1
3 years ago
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(
double bits_per_key) {
return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false);
}
Add Bloom/Ribbon hybrid API support (#8679) Summary: This is essentially resurrection and fixing of the part of https://github.com/facebook/rocksdb/issues/8198 that was reverted in https://github.com/facebook/rocksdb/issues/8212, using data added in https://github.com/facebook/rocksdb/issues/8246. Basically, when configuring Ribbon filter, you can specify an LSM level before which Bloom will be used instead of Ribbon. But Bloom is only considered for Leveled and Universal compaction styles and file going into a known LSM level. This way, SST file writer, FIFO compaction, etc. use Ribbon filter as you would expect with NewRibbonFilterPolicy. So that this can be controlled with a single int value and so that flushes can be distinguished from intra-L0, we consider flush to go to level -1 for the purposes of this option. (Explained in API comment.) I also expect the most common and recommended Ribbon configuration to use Bloom during flush, to minimize slowing down writes and because according to my estimates, Ribbon only pays off if the structure lives in memory for more than an hour. Thus, I have changed the default for NewRibbonFilterPolicy to be this mild hybrid configuration. I don't really want to add something like NewHybridFilterPolicy because at least the mild hybrid configuration (Bloom for flush, Ribbon otherwise) should be considered a natural choice. C APIs also updated, but because they don't support overloading, rocksdb_filterpolicy_create_ribbon is kept pure ribbon for clarity and rocksdb_filterpolicy_create_ribbon_hybrid must be called for a hybrid configuration. While touching C API, I changed bits per key options from int to double. BuiltinFilterPolicy is needed so that LevelThresholdFilterPolicy doesn't inherit unused fields from BloomFilterPolicy. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8679 Test Plan: new + updated tests, including crash test Reviewed By: jay-zhuang Differential Revision: D30445797 Pulled By: pdillinger fbshipit-source-id: 6f5aeddfd6d79f7e55493b563c2d1d2d568892e1
3 years ago
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) {
return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true);
}
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format(
Add Bloom/Ribbon hybrid API support (#8679) Summary: This is essentially resurrection and fixing of the part of https://github.com/facebook/rocksdb/issues/8198 that was reverted in https://github.com/facebook/rocksdb/issues/8212, using data added in https://github.com/facebook/rocksdb/issues/8246. Basically, when configuring Ribbon filter, you can specify an LSM level before which Bloom will be used instead of Ribbon. But Bloom is only considered for Leveled and Universal compaction styles and file going into a known LSM level. This way, SST file writer, FIFO compaction, etc. use Ribbon filter as you would expect with NewRibbonFilterPolicy. So that this can be controlled with a single int value and so that flushes can be distinguished from intra-L0, we consider flush to go to level -1 for the purposes of this option. (Explained in API comment.) I also expect the most common and recommended Ribbon configuration to use Bloom during flush, to minimize slowing down writes and because according to my estimates, Ribbon only pays off if the structure lives in memory for more than an hour. Thus, I have changed the default for NewRibbonFilterPolicy to be this mild hybrid configuration. I don't really want to add something like NewHybridFilterPolicy because at least the mild hybrid configuration (Bloom for flush, Ribbon otherwise) should be considered a natural choice. C APIs also updated, but because they don't support overloading, rocksdb_filterpolicy_create_ribbon is kept pure ribbon for clarity and rocksdb_filterpolicy_create_ribbon_hybrid must be called for a hybrid configuration. While touching C API, I changed bits per key options from int to double. BuiltinFilterPolicy is needed so that LevelThresholdFilterPolicy doesn't inherit unused fields from BloomFilterPolicy. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8679 Test Plan: new + updated tests, including crash test Reviewed By: jay-zhuang Differential Revision: D30445797 Pulled By: pdillinger fbshipit-source-id: 6f5aeddfd6d79f7e55493b563c2d1d2d568892e1
3 years ago
double bloom_equivalent_bits_per_key, int bloom_before_level) {
// Make a rocksdb_filterpolicy_t, but override all of its methods so
// they delegate to a NewRibbonFilterPolicy() instead of user
// supplied C functions.
struct Wrapper : public rocksdb_filterpolicy_t {
const FilterPolicy* rep_;
~Wrapper() override { delete rep_; }
const char* Name() const override { return rep_->Name(); }
Fix a major performance bug in 7.0 re: filter compatibility (#9736) Summary: Bloom filters generated by pre-7.0 releases are not read by 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in https://github.com/facebook/rocksdb/issues/9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness. To fix, we go back using the old, unified name in SST metadata but (for a while anyway) recognize the aliases that could be generated by early 7.0.x releases. This unfortunately requires a public API change to avoid interfering with all the good changes from https://github.com/facebook/rocksdb/issues/9590, but the API change only affects users with custom FilterPolicy, which should be very few. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9736 Test Plan: manual Generate DBs with ``` ./db_bench.7.0 -db=/dev/shm/rocksdb.7.0 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 ``` and similar. Compare with ``` for IMPL in 6.29 7.0 fixed; do for DB in 6.29 7.0 fixed; do echo "Testing $IMPL on $DB:"; ./db_bench.$IMPL -db=/dev/shm/rocksdb.$DB -use_existing_db -readonly -bloom_bits=10 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=10 2>&1 | grep micros/op; done; done ``` Results: ``` Testing 6.29 on 6.29: readrandom : 34.381 micros/op 29085 ops/sec; 3.2 MB/s (291999 of 291999 found) Testing 6.29 on 7.0: readrandom : 190.443 micros/op 5249 ops/sec; 0.6 MB/s (52999 of 52999 found) Testing 6.29 on fixed: readrandom : 40.148 micros/op 24907 ops/sec; 2.8 MB/s (249999 of 249999 found) Testing 7.0 on 6.29: readrandom : 229.430 micros/op 4357 ops/sec; 0.5 MB/s (43999 of 43999 found) Testing 7.0 on 7.0: readrandom : 33.348 micros/op 29986 ops/sec; 3.3 MB/s (299999 of 299999 found) Testing 7.0 on fixed: readrandom : 152.734 micros/op 6546 ops/sec; 0.7 MB/s (65999 of 65999 found) Testing fixed on 6.29: readrandom : 32.024 micros/op 31224 ops/sec; 3.5 MB/s (312999 of 312999 found) Testing fixed on 7.0: readrandom : 33.990 micros/op 29390 ops/sec; 3.3 MB/s (294999 of 294999 found) Testing fixed on fixed: readrandom : 28.714 micros/op 34825 ops/sec; 3.9 MB/s (348999 of 348999 found) ``` Just paying attention to order of magnitude of ops/sec (short test durations, lots of noise), it's clear that with the fix we can read <= 6.29 & >= 7.0 at full speed, where neither 6.29 nor 7.0 can on both. And 6.29 release can properly read fixed DB at full speed. Reviewed By: siying, ajkr Differential Revision: D35057844 Pulled By: pdillinger fbshipit-source-id: a46893a6af4bf084375ebe4728066d00eb08f050
3 years ago
const char* CompatibilityName() const override {
return rep_->CompatibilityName();
}
ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
const override {
return rep_->GetBuilderWithContext(context);
}
ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
const Slice& contents) const override {
return rep_->GetFilterBitsReader(contents);
}
static void DoNothing(void*) {}
};
Wrapper* wrapper = new Wrapper;
Add Bloom/Ribbon hybrid API support (#8679) Summary: This is essentially resurrection and fixing of the part of https://github.com/facebook/rocksdb/issues/8198 that was reverted in https://github.com/facebook/rocksdb/issues/8212, using data added in https://github.com/facebook/rocksdb/issues/8246. Basically, when configuring Ribbon filter, you can specify an LSM level before which Bloom will be used instead of Ribbon. But Bloom is only considered for Leveled and Universal compaction styles and file going into a known LSM level. This way, SST file writer, FIFO compaction, etc. use Ribbon filter as you would expect with NewRibbonFilterPolicy. So that this can be controlled with a single int value and so that flushes can be distinguished from intra-L0, we consider flush to go to level -1 for the purposes of this option. (Explained in API comment.) I also expect the most common and recommended Ribbon configuration to use Bloom during flush, to minimize slowing down writes and because according to my estimates, Ribbon only pays off if the structure lives in memory for more than an hour. Thus, I have changed the default for NewRibbonFilterPolicy to be this mild hybrid configuration. I don't really want to add something like NewHybridFilterPolicy because at least the mild hybrid configuration (Bloom for flush, Ribbon otherwise) should be considered a natural choice. C APIs also updated, but because they don't support overloading, rocksdb_filterpolicy_create_ribbon is kept pure ribbon for clarity and rocksdb_filterpolicy_create_ribbon_hybrid must be called for a hybrid configuration. While touching C API, I changed bits per key options from int to double. BuiltinFilterPolicy is needed so that LevelThresholdFilterPolicy doesn't inherit unused fields from BloomFilterPolicy. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8679 Test Plan: new + updated tests, including crash test Reviewed By: jay-zhuang Differential Revision: D30445797 Pulled By: pdillinger fbshipit-source-id: 6f5aeddfd6d79f7e55493b563c2d1d2d568892e1
3 years ago
wrapper->rep_ =
NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level);
wrapper->state_ = nullptr;
wrapper->destructor_ = &Wrapper::DoNothing;
return wrapper;
}
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon(
Add Bloom/Ribbon hybrid API support (#8679) Summary: This is essentially resurrection and fixing of the part of https://github.com/facebook/rocksdb/issues/8198 that was reverted in https://github.com/facebook/rocksdb/issues/8212, using data added in https://github.com/facebook/rocksdb/issues/8246. Basically, when configuring Ribbon filter, you can specify an LSM level before which Bloom will be used instead of Ribbon. But Bloom is only considered for Leveled and Universal compaction styles and file going into a known LSM level. This way, SST file writer, FIFO compaction, etc. use Ribbon filter as you would expect with NewRibbonFilterPolicy. So that this can be controlled with a single int value and so that flushes can be distinguished from intra-L0, we consider flush to go to level -1 for the purposes of this option. (Explained in API comment.) I also expect the most common and recommended Ribbon configuration to use Bloom during flush, to minimize slowing down writes and because according to my estimates, Ribbon only pays off if the structure lives in memory for more than an hour. Thus, I have changed the default for NewRibbonFilterPolicy to be this mild hybrid configuration. I don't really want to add something like NewHybridFilterPolicy because at least the mild hybrid configuration (Bloom for flush, Ribbon otherwise) should be considered a natural choice. C APIs also updated, but because they don't support overloading, rocksdb_filterpolicy_create_ribbon is kept pure ribbon for clarity and rocksdb_filterpolicy_create_ribbon_hybrid must be called for a hybrid configuration. While touching C API, I changed bits per key options from int to double. BuiltinFilterPolicy is needed so that LevelThresholdFilterPolicy doesn't inherit unused fields from BloomFilterPolicy. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8679 Test Plan: new + updated tests, including crash test Reviewed By: jay-zhuang Differential Revision: D30445797 Pulled By: pdillinger fbshipit-source-id: 6f5aeddfd6d79f7e55493b563c2d1d2d568892e1
3 years ago
double bloom_equivalent_bits_per_key) {
return rocksdb_filterpolicy_create_ribbon_format(
Add Bloom/Ribbon hybrid API support (#8679) Summary: This is essentially resurrection and fixing of the part of https://github.com/facebook/rocksdb/issues/8198 that was reverted in https://github.com/facebook/rocksdb/issues/8212, using data added in https://github.com/facebook/rocksdb/issues/8246. Basically, when configuring Ribbon filter, you can specify an LSM level before which Bloom will be used instead of Ribbon. But Bloom is only considered for Leveled and Universal compaction styles and file going into a known LSM level. This way, SST file writer, FIFO compaction, etc. use Ribbon filter as you would expect with NewRibbonFilterPolicy. So that this can be controlled with a single int value and so that flushes can be distinguished from intra-L0, we consider flush to go to level -1 for the purposes of this option. (Explained in API comment.) I also expect the most common and recommended Ribbon configuration to use Bloom during flush, to minimize slowing down writes and because according to my estimates, Ribbon only pays off if the structure lives in memory for more than an hour. Thus, I have changed the default for NewRibbonFilterPolicy to be this mild hybrid configuration. I don't really want to add something like NewHybridFilterPolicy because at least the mild hybrid configuration (Bloom for flush, Ribbon otherwise) should be considered a natural choice. C APIs also updated, but because they don't support overloading, rocksdb_filterpolicy_create_ribbon is kept pure ribbon for clarity and rocksdb_filterpolicy_create_ribbon_hybrid must be called for a hybrid configuration. While touching C API, I changed bits per key options from int to double. BuiltinFilterPolicy is needed so that LevelThresholdFilterPolicy doesn't inherit unused fields from BloomFilterPolicy. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8679 Test Plan: new + updated tests, including crash test Reviewed By: jay-zhuang Differential Revision: D30445797 Pulled By: pdillinger fbshipit-source-id: 6f5aeddfd6d79f7e55493b563c2d1d2d568892e1
3 years ago
bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1);
}
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid(
double bloom_equivalent_bits_per_key, int bloom_before_level) {
return rocksdb_filterpolicy_create_ribbon_format(
bloom_equivalent_bits_per_key, bloom_before_level);
}
rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
void* state, void (*destructor)(void*),
char* (*full_merge)(void*, const char* key, size_t key_length,
const char* existing_value,
size_t existing_value_length,
const char* const* operands_list,
const size_t* operands_list_length, int num_operands,
unsigned char* success, size_t* new_value_length),
char* (*partial_merge)(void*, const char* key, size_t key_length,
const char* const* operands_list,
const size_t* operands_list_length, int num_operands,
unsigned char* success, size_t* new_value_length),
void (*delete_value)(void*, const char* value, size_t value_length),
const char* (*name)(void*)) {
rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t;
result->state_ = state;
result->destructor_ = destructor;
result->full_merge_ = full_merge;
result->partial_merge_ = partial_merge;
result->delete_value_ = delete_value;
result->name_ = name;
return result;
}
void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) {
delete merge_operator;
}
rocksdb_readoptions_t* rocksdb_readoptions_create() {
return new rocksdb_readoptions_t;
}
void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) {
delete opt;
}
void rocksdb_readoptions_set_verify_checksums(
rocksdb_readoptions_t* opt,
unsigned char v) {
opt->rep.verify_checksums = v;
}
unsigned char rocksdb_readoptions_get_verify_checksums(
rocksdb_readoptions_t* opt) {
return opt->rep.verify_checksums;
}
void rocksdb_readoptions_set_fill_cache(
rocksdb_readoptions_t* opt, unsigned char v) {
opt->rep.fill_cache = v;
}
unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) {
return opt->rep.fill_cache;
}
void rocksdb_readoptions_set_snapshot(
rocksdb_readoptions_t* opt,
const rocksdb_snapshot_t* snap) {
opt->rep.snapshot = (snap ? snap->rep : nullptr);
}
void rocksdb_readoptions_set_iterate_upper_bound(
rocksdb_readoptions_t* opt,
const char* key, size_t keylen) {
if (key == nullptr) {
opt->upper_bound = Slice();
opt->rep.iterate_upper_bound = nullptr;
} else {
opt->upper_bound = Slice(key, keylen);
opt->rep.iterate_upper_bound = &opt->upper_bound;
}
}
void rocksdb_readoptions_set_iterate_lower_bound(
rocksdb_readoptions_t *opt,
const char* key, size_t keylen) {
if (key == nullptr) {
opt->lower_bound = Slice();
opt->rep.iterate_lower_bound = nullptr;
} else {
opt->lower_bound = Slice(key, keylen);
opt->rep.iterate_lower_bound = &opt->lower_bound;
}
}
void rocksdb_readoptions_set_read_tier(
rocksdb_readoptions_t* opt, int v) {
opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v);
}
int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) {
return static_cast<int>(opt->rep.read_tier);
}
void rocksdb_readoptions_set_tailing(
rocksdb_readoptions_t* opt, unsigned char v) {
opt->rep.tailing = v;
}
unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) {
return opt->rep.tailing;
}
void rocksdb_readoptions_set_managed(
rocksdb_readoptions_t* opt, unsigned char v) {
opt->rep.managed = v;
}
void rocksdb_readoptions_set_readahead_size(
rocksdb_readoptions_t* opt, size_t v) {
opt->rep.readahead_size = v;
}
size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) {
return opt->rep.readahead_size;
}
void rocksdb_readoptions_set_prefix_same_as_start(
rocksdb_readoptions_t* opt, unsigned char v) {
opt->rep.prefix_same_as_start = v;
}
unsigned char rocksdb_readoptions_get_prefix_same_as_start(
rocksdb_readoptions_t* opt) {
return opt->rep.prefix_same_as_start;
}
void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt,
unsigned char v) {
opt->rep.pin_data = v;
}
unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) {
return opt->rep.pin_data;
}
void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt,
unsigned char v) {
opt->rep.total_order_seek = v;
}
unsigned char rocksdb_readoptions_get_total_order_seek(
rocksdb_readoptions_t* opt) {
return opt->rep.total_order_seek;
}
void rocksdb_readoptions_set_max_skippable_internal_keys(
rocksdb_readoptions_t* opt,
uint64_t v) {
opt->rep.max_skippable_internal_keys = v;
}
uint64_t rocksdb_readoptions_get_max_skippable_internal_keys(
rocksdb_readoptions_t* opt) {
return opt->rep.max_skippable_internal_keys;
}
void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
rocksdb_readoptions_t* opt, unsigned char v) {
opt->rep.background_purge_on_iterator_cleanup = v;
}
unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
rocksdb_readoptions_t* opt) {
return opt->rep.background_purge_on_iterator_cleanup;
}
void rocksdb_readoptions_set_ignore_range_deletions(
rocksdb_readoptions_t* opt, unsigned char v) {
opt->rep.ignore_range_deletions = v;
}
unsigned char rocksdb_readoptions_get_ignore_range_deletions(
rocksdb_readoptions_t* opt) {
return opt->rep.ignore_range_deletions;
}
void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt,
uint64_t microseconds) {
opt->rep.deadline = std::chrono::microseconds(microseconds);
}
uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) {
return opt->rep.deadline.count();
}
void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt,
uint64_t microseconds) {
opt->rep.io_timeout = std::chrono::microseconds(microseconds);
}
extern ROCKSDB_LIBRARY_API uint64_t
rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
return opt->rep.io_timeout.count();
}
void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
const char* ts, size_t tslen) {
if (ts == nullptr) {
opt->timestamp = Slice();
opt->rep.timestamp = nullptr;
} else {
opt->timestamp = Slice(ts, tslen);
opt->rep.timestamp = &opt->timestamp;
}
}
void rocksdb_readoptions_set_iter_start_ts(rocksdb_readoptions_t* opt,
const char* ts, size_t tslen) {
if (ts == nullptr) {
opt->iter_start_ts = Slice();
opt->rep.iter_start_ts = nullptr;
} else {
opt->iter_start_ts = Slice(ts, tslen);
opt->rep.iter_start_ts = &opt->iter_start_ts;
}
}
rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
return new rocksdb_writeoptions_t;
}
void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
delete opt;
}
void rocksdb_writeoptions_set_sync(
rocksdb_writeoptions_t* opt, unsigned char v) {
opt->rep.sync = v;
}
unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) {
return opt->rep.sync;
}
void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt,
int disable) {
opt->rep.disableWAL = disable;
}
unsigned char rocksdb_writeoptions_get_disable_WAL(
rocksdb_writeoptions_t* opt) {
return opt->rep.disableWAL;
}
void rocksdb_writeoptions_set_ignore_missing_column_families(
rocksdb_writeoptions_t* opt, unsigned char v) {
opt->rep.ignore_missing_column_families = v;
}
unsigned char rocksdb_writeoptions_get_ignore_missing_column_families(
rocksdb_writeoptions_t* opt) {
return opt->rep.ignore_missing_column_families;
}
void rocksdb_writeoptions_set_no_slowdown(rocksdb_writeoptions_t* opt,
unsigned char v) {
opt->rep.no_slowdown = v;
}
unsigned char rocksdb_writeoptions_get_no_slowdown(
rocksdb_writeoptions_t* opt) {
return opt->rep.no_slowdown;
}
void rocksdb_writeoptions_set_low_pri(rocksdb_writeoptions_t* opt,
unsigned char v) {
opt->rep.low_pri = v;
}
unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) {
return opt->rep.low_pri;
}
void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(
rocksdb_writeoptions_t* opt, unsigned char v) {
opt->rep.memtable_insert_hint_per_batch = v;
}
unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch(
rocksdb_writeoptions_t* opt) {
return opt->rep.memtable_insert_hint_per_batch;
}
rocksdb_compactoptions_t* rocksdb_compactoptions_create() {
return new rocksdb_compactoptions_t;
}
void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) {
delete opt;
}
void rocksdb_compactoptions_set_bottommost_level_compaction(
rocksdb_compactoptions_t* opt, unsigned char v) {
opt->rep.bottommost_level_compaction = static_cast<BottommostLevelCompaction>(v);
}
unsigned char rocksdb_compactoptions_get_bottommost_level_compaction(
rocksdb_compactoptions_t* opt) {
return static_cast<unsigned char>(opt->rep.bottommost_level_compaction);
}
void rocksdb_compactoptions_set_exclusive_manual_compaction(
rocksdb_compactoptions_t* opt, unsigned char v) {
opt->rep.exclusive_manual_compaction = v;
}
unsigned char rocksdb_compactoptions_get_exclusive_manual_compaction(
rocksdb_compactoptions_t* opt) {
return opt->rep.exclusive_manual_compaction;
}
void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt,
unsigned char v) {
opt->rep.change_level = v;
}
unsigned char rocksdb_compactoptions_get_change_level(
rocksdb_compactoptions_t* opt) {
return opt->rep.change_level;
}
void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt,
int n) {
opt->rep.target_level = n;
}
int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) {
return opt->rep.target_level;
}
void rocksdb_compactoptions_set_full_history_ts_low(
rocksdb_compactoptions_t* opt, char* ts, size_t tslen) {
if (ts == nullptr) {
opt->full_history_ts_low = Slice();
opt->rep.full_history_ts_low = nullptr;
} else {
opt->full_history_ts_low = Slice(ts, tslen);
opt->rep.full_history_ts_low = &opt->full_history_ts_low;
}
}
rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
return new rocksdb_flushoptions_t;
}
void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) {
delete opt;
}
void rocksdb_flushoptions_set_wait(
rocksdb_flushoptions_t* opt, unsigned char v) {
opt->rep.wait = v;
}
unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) {
return opt->rep.wait;
}
rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create(
char** errptr) {
rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t;
ROCKSDB_NAMESPACE::JemallocAllocatorOptions options;
SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator(
options, &allocator->rep));
return allocator;
}
void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) {
delete allocator;
}
rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() {
return new rocksdb_lru_cache_options_t;
}
void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) {
delete opt;
}
void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt,
size_t capacity) {
opt->rep.capacity = capacity;
}
void rocksdb_lru_cache_options_set_memory_allocator(
rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) {
opt->rep.memory_allocator = allocator->rep;
}
rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = NewLRUCache(capacity);
return c;
}
rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(
size_t capacity) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = NewLRUCache(capacity);
c->rep->SetStrictCapacityLimit(true);
return c;
}
rocksdb_cache_t* rocksdb_cache_create_lru_opts(
rocksdb_lru_cache_options_t* opt) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = NewLRUCache(opt->rep);
return c;
}
void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
delete cache;
}
void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
cache->rep->DisownData();
}
void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
cache->rep->SetCapacity(capacity);
}
size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) {
return cache->rep->GetCapacity();
}
size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
return cache->rep->GetUsage();
}
size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
return cache->rep->GetPinnedUsage();
}
rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) {
rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
result->rep.path = std::string(path);
result->rep.target_size = target_size;
return result;
}
void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) {
delete dbpath;
}
rocksdb_env_t* rocksdb_create_default_env() {
rocksdb_env_t* result = new rocksdb_env_t;
result->rep = Env::Default();
result->is_default = true;
return result;
}
rocksdb_env_t* rocksdb_create_mem_env() {
rocksdb_env_t* result = new rocksdb_env_t;
result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
result->is_default = false;
return result;
}
void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
env->rep->SetBackgroundThreads(n);
}
int rocksdb_env_get_background_threads(rocksdb_env_t* env) {
return env->rep->GetBackgroundThreads();
}
void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env,
int n) {
env->rep->SetBackgroundThreads(n, Env::BOTTOM);
}
int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) {
return env->rep->GetBackgroundThreads(Env::BOTTOM);
}
void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) {
env->rep->SetBackgroundThreads(n, Env::HIGH);
}
int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) {
return env->rep->GetBackgroundThreads(Env::HIGH);
}
void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env,
int n) {
env->rep->SetBackgroundThreads(n, Env::LOW);
}
int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) {
return env->rep->GetBackgroundThreads(Env::LOW);
}
void rocksdb_env_join_all_threads(rocksdb_env_t* env) {
env->rep->WaitForJoin();
}
void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) {
env->rep->LowerThreadPoolIOPriority();
}
void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) {
env->rep->LowerThreadPoolIOPriority(Env::HIGH);
}
void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) {
env->rep->LowerThreadPoolCPUPriority();
}
void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) {
env->rep->LowerThreadPoolCPUPriority(Env::HIGH);
}
void rocksdb_env_destroy(rocksdb_env_t* env) {
if (!env->is_default) delete env->rep;
delete env;
}
rocksdb_envoptions_t* rocksdb_envoptions_create() {
rocksdb_envoptions_t* opt = new rocksdb_envoptions_t;
return opt;
}
void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; }
rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create(
const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) {
rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
writer->rep = new SstFileWriter(env->rep, io_options->rep);
return writer;
}
void rocksdb_create_dir_if_missing(rocksdb_env_t* env, const char* path,
char** errptr) {
SaveError(errptr, env->rep->CreateDirIfMissing(std::string(path)));
}
rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator(
const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
const rocksdb_comparator_t* /*comparator*/) {
rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
writer->rep = new SstFileWriter(env->rep, io_options->rep);
return writer;
}
void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer,
const char* name, char** errptr) {
SaveError(errptr, writer->rep->Open(std::string(name)));
}
void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key,
size_t keylen, const char* val, size_t vallen,
char** errptr) {
SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
}
void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key,
size_t keylen, const char* val, size_t vallen,
char** errptr) {
SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
}
void rocksdb_sstfilewriter_put_with_ts(rocksdb_sstfilewriter_t* writer,
const char* key, size_t keylen,
const char* ts, size_t tslen,
const char* val, size_t vallen,
char** errptr) {
SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(ts, tslen),
Slice(val, vallen)));
}
void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen)));
}
void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer,
const char* key, size_t keylen,
char** errptr) {
SaveError(errptr, writer->rep->Delete(Slice(key, keylen)));
}
void rocksdb_sstfilewriter_delete_with_ts(rocksdb_sstfilewriter_t* writer,
const char* key, size_t keylen,
const char* ts, size_t tslen,
char** errptr) {
SaveError(errptr, writer->rep->Delete(Slice(key, keylen), Slice(ts, tslen)));
}
void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
char** errptr) {
SaveError(errptr, writer->rep->Finish(nullptr));
}
void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer,
uint64_t* file_size) {
*file_size = writer->rep->FileSize();
}
void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) {
delete writer->rep;
delete writer;
}
rocksdb_ingestexternalfileoptions_t*
rocksdb_ingestexternalfileoptions_create() {
rocksdb_ingestexternalfileoptions_t* opt =
new rocksdb_ingestexternalfileoptions_t;
return opt;
}
void rocksdb_ingestexternalfileoptions_set_move_files(
rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) {
opt->rep.move_files = move_files;
}
void rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
rocksdb_ingestexternalfileoptions_t* opt,
unsigned char snapshot_consistency) {
opt->rep.snapshot_consistency = snapshot_consistency;
}
void rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
rocksdb_ingestexternalfileoptions_t* opt,
unsigned char allow_global_seqno) {
opt->rep.allow_global_seqno = allow_global_seqno;
}
void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
rocksdb_ingestexternalfileoptions_t* opt,
unsigned char allow_blocking_flush) {
opt->rep.allow_blocking_flush = allow_blocking_flush;
}
void rocksdb_ingestexternalfileoptions_set_ingest_behind(
rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind) {
opt->rep.ingest_behind = ingest_behind;
}
void rocksdb_ingestexternalfileoptions_destroy(
rocksdb_ingestexternalfileoptions_t* opt) {
delete opt;
}
void rocksdb_ingest_external_file(
rocksdb_t* db, const char* const* file_list, const size_t list_len,
const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
std::vector<std::string> files(list_len);
for (size_t i = 0; i < list_len; ++i) {
files[i] = std::string(file_list[i]);
}
SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep));
}
void rocksdb_ingest_external_file_cf(
rocksdb_t* db, rocksdb_column_family_handle_t* handle,
const char* const* file_list, const size_t list_len,
const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
std::vector<std::string> files(list_len);
for (size_t i = 0; i < list_len; ++i) {
files[i] = std::string(file_list[i]);
}
SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep));
}
void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) {
SaveError(errptr, db->rep->TryCatchUpWithPrimary());
}
rocksdb_slicetransform_t* rocksdb_slicetransform_create(
void* state,
void (*destructor)(void*),
char* (*transform)(
void*,
const char* key, size_t length,
size_t* dst_length),
unsigned char (*in_domain)(
void*,
const char* key, size_t length),
unsigned char (*in_range)(
void*,
const char* key, size_t length),
const char* (*name)(void*)) {
rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
result->state_ = state;
result->destructor_ = destructor;
result->transform_ = transform;
result->in_domain_ = in_domain;
result->in_range_ = in_range;
result->name_ = name;
return result;
}
void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) {
delete st;
}
struct SliceTransformWrapper : public rocksdb_slicetransform_t {
const SliceTransform* rep_;
~SliceTransformWrapper() override { delete rep_; }
const char* Name() const override { return rep_->Name(); }
std::string GetId() const override { return rep_->GetId(); }
Slice Transform(const Slice& src) const override {
return rep_->Transform(src);
}
bool InDomain(const Slice& src) const override {
return rep_->InDomain(src);
}
bool InRange(const Slice& src) const override { return rep_->InRange(src); }
static void DoNothing(void*) { }
};
rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) {
SliceTransformWrapper* wrapper = new SliceTransformWrapper;
wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen);
wrapper->state_ = nullptr;
wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
return wrapper;
}
rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
SliceTransformWrapper* wrapper = new SliceTransformWrapper;
wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform();
wrapper->state_ = nullptr;
wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
return wrapper;
}
rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() {
rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t;
result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal;
return result;
}
void rocksdb_universal_compaction_options_set_size_ratio(
rocksdb_universal_compaction_options_t* uco, int ratio) {
uco->rep->size_ratio = ratio;
}
int rocksdb_universal_compaction_options_get_size_ratio(
rocksdb_universal_compaction_options_t* uco) {
return uco->rep->size_ratio;
}
void rocksdb_universal_compaction_options_set_min_merge_width(
rocksdb_universal_compaction_options_t* uco, int w) {
uco->rep->min_merge_width = w;
}
int rocksdb_universal_compaction_options_get_min_merge_width(
rocksdb_universal_compaction_options_t* uco) {
return uco->rep->min_merge_width;
}
void rocksdb_universal_compaction_options_set_max_merge_width(
rocksdb_universal_compaction_options_t* uco, int w) {
uco->rep->max_merge_width = w;
}
int rocksdb_universal_compaction_options_get_max_merge_width(
rocksdb_universal_compaction_options_t* uco) {
return uco->rep->max_merge_width;
}
void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
rocksdb_universal_compaction_options_t* uco, int p) {
uco->rep->max_size_amplification_percent = p;
}
int rocksdb_universal_compaction_options_get_max_size_amplification_percent(
rocksdb_universal_compaction_options_t* uco) {
return uco->rep->max_size_amplification_percent;
}
void rocksdb_universal_compaction_options_set_compression_size_percent(
rocksdb_universal_compaction_options_t* uco, int p) {
uco->rep->compression_size_percent = p;
}
int rocksdb_universal_compaction_options_get_compression_size_percent(
rocksdb_universal_compaction_options_t* uco) {
return uco->rep->compression_size_percent;
}
void rocksdb_universal_compaction_options_set_stop_style(
rocksdb_universal_compaction_options_t* uco, int style) {
uco->rep->stop_style =
static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style);
}
int rocksdb_universal_compaction_options_get_stop_style(
rocksdb_universal_compaction_options_t* uco) {
return static_cast<int>(uco->rep->stop_style);
}
void rocksdb_universal_compaction_options_destroy(
rocksdb_universal_compaction_options_t* uco) {
delete uco->rep;
delete uco;
}
rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
rocksdb_fifo_compaction_options_t* result = new rocksdb_fifo_compaction_options_t;
result->rep = CompactionOptionsFIFO();
return result;
}
void rocksdb_fifo_compaction_options_set_max_table_files_size(
rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
fifo_opts->rep.max_table_files_size = size;
}
uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size(
rocksdb_fifo_compaction_options_t* fifo_opts) {
return fifo_opts->rep.max_table_files_size;
}
void rocksdb_fifo_compaction_options_destroy(
rocksdb_fifo_compaction_options_t* fifo_opts) {
delete fifo_opts;
}
void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) {
if (level >= 0) {
assert(level <= opt->rep.num_levels);
opt->rep.compression_per_level.resize(opt->rep.num_levels);
for (int i = 0; i < level; i++) {
opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression;
}
for (int i = level; i < opt->rep.num_levels; i++) {
opt->rep.compression_per_level[i] = opt->rep.compression;
}
}
}
int rocksdb_livefiles_count(
const rocksdb_livefiles_t* lf) {
return static_cast<int>(lf->rep.size());
}
const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf,
int index) {
return lf->rep[index].column_family_name.c_str();
}
const char* rocksdb_livefiles_name(
const rocksdb_livefiles_t* lf,
int index) {
return lf->rep[index].name.c_str();
}
int rocksdb_livefiles_level(
const rocksdb_livefiles_t* lf,
int index) {
return lf->rep[index].level;
}
size_t rocksdb_livefiles_size(
const rocksdb_livefiles_t* lf,
int index) {
return lf->rep[index].size;
}
const char* rocksdb_livefiles_smallestkey(
const rocksdb_livefiles_t* lf,
int index,
size_t* size) {
*size = lf->rep[index].smallestkey.size();
return lf->rep[index].smallestkey.data();
}
const char* rocksdb_livefiles_largestkey(
const rocksdb_livefiles_t* lf,
int index,
size_t* size) {
*size = lf->rep[index].largestkey.size();
return lf->rep[index].largestkey.data();
}
uint64_t rocksdb_livefiles_entries(
const rocksdb_livefiles_t* lf,
int index) {
return lf->rep[index].num_entries;
}
uint64_t rocksdb_livefiles_deletions(
const rocksdb_livefiles_t* lf,
int index) {
return lf->rep[index].num_deletions;
}
extern void rocksdb_livefiles_destroy(
const rocksdb_livefiles_t* lf) {
delete lf;
}
void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
const char* opts_str,
rocksdb_options_t* new_options,
char** errptr) {
SaveError(errptr,
GetOptionsFromString(base_options->rep, std::string(opts_str),
&new_options->rep));
}
void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key,
size_t start_key_len, const char* limit_key,
size_t limit_key_len, char** errptr) {
Slice a, b;
SaveError(
errptr,
DeleteFilesInRange(
db->rep, db->rep->DefaultColumnFamily(),
(start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
}
void rocksdb_delete_file_in_range_cf(
rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
const char* start_key, size_t start_key_len, const char* limit_key,
size_t limit_key_len, char** errptr) {
Slice a, b;
SaveError(
errptr,
DeleteFilesInRange(
db->rep, column_family->rep,
(start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
}
rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
return new rocksdb_transactiondb_options_t;
}
void rocksdb_transactiondb_options_destroy(rocksdb_transactiondb_options_t* opt){
delete opt;
}
void rocksdb_transactiondb_options_set_max_num_locks(
rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) {
opt->rep.max_num_locks = max_num_locks;
}
void rocksdb_transactiondb_options_set_num_stripes(
rocksdb_transactiondb_options_t* opt, size_t num_stripes) {
opt->rep.num_stripes = num_stripes;
}
void rocksdb_transactiondb_options_set_transaction_lock_timeout(
rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) {
opt->rep.transaction_lock_timeout = txn_lock_timeout;
}
void rocksdb_transactiondb_options_set_default_lock_timeout(
rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) {
opt->rep.default_lock_timeout = default_lock_timeout;
}
rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
return new rocksdb_transaction_options_t;
}
void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) {
delete opt;
}
void rocksdb_transaction_options_set_set_snapshot(
rocksdb_transaction_options_t* opt, unsigned char v) {
opt->rep.set_snapshot = v;
}
void rocksdb_transaction_options_set_deadlock_detect(
rocksdb_transaction_options_t* opt, unsigned char v) {
opt->rep.deadlock_detect = v;
}
void rocksdb_transaction_options_set_lock_timeout(
rocksdb_transaction_options_t* opt, int64_t lock_timeout) {
opt->rep.lock_timeout = lock_timeout;
}
void rocksdb_transaction_options_set_expiration(
rocksdb_transaction_options_t* opt, int64_t expiration) {
opt->rep.expiration = expiration;
}
void rocksdb_transaction_options_set_deadlock_detect_depth(
rocksdb_transaction_options_t* opt, int64_t depth) {
opt->rep.deadlock_detect_depth = depth;
}
void rocksdb_transaction_options_set_max_write_batch_size(
rocksdb_transaction_options_t* opt, size_t size) {
opt->rep.max_write_batch_size = size;
}
void rocksdb_transaction_options_set_skip_prepare(
rocksdb_transaction_options_t* opt, unsigned char v) {
opt->rep.skip_prepare = v;
}
rocksdb_optimistictransaction_options_t*
rocksdb_optimistictransaction_options_create() {
return new rocksdb_optimistictransaction_options_t;
}
void rocksdb_optimistictransaction_options_destroy(
rocksdb_optimistictransaction_options_t* opt) {
delete opt;
}
void rocksdb_optimistictransaction_options_set_set_snapshot(
rocksdb_optimistictransaction_options_t* opt, unsigned char v) {
opt->rep.set_snapshot = v;
}
char* rocksdb_optimistictransactiondb_property_value(
rocksdb_optimistictransactiondb_t* db, const char* propname) {
std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) {
// We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else {
return nullptr;
}
}
int rocksdb_optimistictransactiondb_property_int(
rocksdb_optimistictransactiondb_t* db, const char* propname,
uint64_t* out_val) {
if (db->rep->GetIntProperty(Slice(propname), out_val)) {
return 0;
} else {
return -1;
}
}
rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family(
rocksdb_transactiondb_t* txn_db,
const rocksdb_options_t* column_family_options,
const char* column_family_name, char** errptr) {
rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
SaveError(errptr, txn_db->rep->CreateColumnFamily(
ColumnFamilyOptions(column_family_options->rep),
std::string(column_family_name), &(handle->rep)));
return handle;
}
rocksdb_transactiondb_t* rocksdb_transactiondb_open(
const rocksdb_options_t* options,
const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
char** errptr) {
TransactionDB* txn_db;
if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
std::string(name), &txn_db))) {
return nullptr;
}
rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
result->rep = txn_db;
return result;
}
rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families(
const rocksdb_options_t* options,
const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
int num_column_families, const char* const* column_family_names,
const rocksdb_options_t* const* column_family_options,
rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i < num_column_families; i++) {
column_families.push_back(ColumnFamilyDescriptor(
std::string(column_family_names[i]),
ColumnFamilyOptions(column_family_options[i]->rep)));
}
TransactionDB* txn_db;
std::vector<ColumnFamilyHandle*> handles;
if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
std::string(name), column_families,
&handles, &txn_db))) {
return nullptr;
}
for (size_t i = 0; i < handles.size(); i++) {
rocksdb_column_family_handle_t* c_handle =
new rocksdb_column_family_handle_t;
c_handle->rep = handles[i];
column_family_handles[i] = c_handle;
}
rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
result->rep = txn_db;
return result;
}
const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(
rocksdb_transactiondb_t* txn_db) {
rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
result->rep = txn_db->rep->GetSnapshot();
return result;
}
void rocksdb_transactiondb_release_snapshot(
rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) {
txn_db->rep->ReleaseSnapshot(snapshot->rep);
delete snapshot;
}
char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db,
const char* propname) {
std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) {
// We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else {
return nullptr;
}
}
int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
const char* propname,
uint64_t* out_val) {
if (db->rep->GetIntProperty(Slice(propname), out_val)) {
return 0;
} else {
return -1;
}
}
rocksdb_transaction_t* rocksdb_transaction_begin(
rocksdb_transactiondb_t* txn_db,
const rocksdb_writeoptions_t* write_options,
const rocksdb_transaction_options_t* txn_options,
rocksdb_transaction_t* old_txn) {
if (old_txn == nullptr) {
rocksdb_transaction_t* result = new rocksdb_transaction_t;
result->rep = txn_db->rep->BeginTransaction(write_options->rep,
txn_options->rep, nullptr);
return result;
}
old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep,
txn_options->rep, old_txn->rep);
return old_txn;
}
rocksdb_transaction_t** rocksdb_transactiondb_get_prepared_transactions(
rocksdb_transactiondb_t* txn_db, size_t* cnt) {
std::vector<Transaction*> txns;
txn_db->rep->GetAllPreparedTransactions(&txns);
*cnt = txns.size();
if (txns.empty()) {
return nullptr;
} else {
rocksdb_transaction_t** buf = (rocksdb_transaction_t**)malloc(
txns.size() * sizeof(rocksdb_transaction_t*));
for (size_t i = 0; i < txns.size(); i++) {
buf[i] = new rocksdb_transaction_t;
buf[i]->rep = txns[i];
}
return buf;
}
}
void rocksdb_transaction_set_name(rocksdb_transaction_t* txn, const char* name,
size_t name_len, char** errptr) {
std::string str = std::string(name, name_len);
SaveError(errptr, txn->rep->SetName(str));
}
char* rocksdb_transaction_get_name(rocksdb_transaction_t* txn,
size_t* name_len) {
auto name = txn->rep->GetName();
*name_len = name.size();
return CopyString(name);
}
void rocksdb_transaction_prepare(rocksdb_transaction_t* txn, char** errptr) {
SaveError(errptr, txn->rep->Prepare());
}
rocksdb_writebatch_wi_t* rocksdb_transaction_get_writebatch_wi(
rocksdb_transaction_t* txn) {
rocksdb_writebatch_wi_t* wi =
(rocksdb_writebatch_wi_t*)malloc(sizeof(rocksdb_writebatch_wi_t));
wi->rep = txn->rep->GetWriteBatch();
return wi;
}
void rocksdb_transaction_rebuild_from_writebatch(
rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch,
char** errptr) {
SaveError(errptr, txn->rep->RebuildFromWriteBatch(&writebatch->rep));
}
void rocksdb_transaction_rebuild_from_writebatch_wi(rocksdb_transaction_t* txn,
rocksdb_writebatch_wi_t* wi,
char** errptr) {
SaveError(errptr, txn->rep->RebuildFromWriteBatch(wi->rep->GetWriteBatch()));
}
void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) {
SaveError(errptr, txn->rep->Commit());
}
void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) {
SaveError(errptr, txn->rep->Rollback());
}
void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) {
txn->rep->SetSavePoint();
}
void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, char** errptr) {
SaveError(errptr, txn->rep->RollbackToSavePoint());
}
void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {
delete txn->rep;
delete txn;
}
const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
rocksdb_transaction_t* txn) {
// This will be freed later on using free, so use malloc here to avoid a
// mismatch
rocksdb_snapshot_t* result =
(rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t));
result->rep = txn->rep->GetSnapshot();
return result;
}
// Read a key inside a transaction
char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
const rocksdb_readoptions_t* options,
const char* key, size_t klen, size_t* vlen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
if (s.ok()) {
*vlen = tmp.size();
result = CopyString(tmp);
} else {
*vlen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
const char* key, size_t klen, char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = txn->rep->Get(options->rep, Slice(key, klen), &v->rep);
if (!s.ok()) {
delete (v);
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, size_t* vlen,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s =
txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
if (s.ok()) {
*vlen = tmp.size();
result = CopyString(tmp);
} else {
*vlen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen),
&v->rep);
if (!s.ok()) {
delete (v);
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
// Read a key inside a transaction
char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
const rocksdb_readoptions_t* options,
const char* key, size_t klen,
size_t* vlen, unsigned char exclusive,
char** errptr) {
char* result = nullptr;
std::string tmp;
Status s =
txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
if (s.ok()) {
*vlen = tmp.size();
result = CopyString(tmp);
} else {
*vlen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
const char* key, size_t klen, unsigned char exclusive, char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen),
v->rep.GetSelf(), exclusive);
v->rep.PinSelf();
if (!s.ok()) {
delete (v);
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
char* rocksdb_transaction_get_for_update_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
size_t* vlen, unsigned char exclusive, char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
Slice(key, klen), &tmp, exclusive);
if (s.ok()) {
*vlen = tmp.size();
result = CopyString(tmp);
} else {
*vlen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
unsigned char exclusive, char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
Slice(key, klen), &v->rep, exclusive);
if (!s.ok()) {
delete (v);
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
const rocksdb_readoptions_t* options,
size_t num_keys,
const char* const* keys_list,
const size_t* keys_list_sizes,
char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn->rep->MultiGet(options->rep, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_transaction_multi_get_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
std::vector<ColumnFamilyHandle*> cfs(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
cfs[i] = column_families[i]->rep;
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn->rep->MultiGet(options->rep, cfs, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
// Read a key outside a transaction
char* rocksdb_transactiondb_get(
rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options,
const char* key, size_t klen,
size_t* vlen,
char** errptr){
char* result = nullptr;
std::string tmp;
Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
if (s.ok()) {
*vlen = tmp.size();
result = CopyString(tmp);
} else {
*vlen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned(
rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
const char* key, size_t klen, char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(),
Slice(key, klen), &v->rep);
if (!s.ok()) {
delete (v);
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
char* rocksdb_transactiondb_get_cf(
rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
size_t keylen, size_t* vallen, char** errptr) {
char* result = nullptr;
std::string tmp;
Status s = txn_db->rep->Get(options->rep, column_family->rep,
Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned_cf(
rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
size_t keylen, char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = txn_db->rep->Get(options->rep, column_family->rep,
Slice(key, keylen), &v->rep);
if (!s.ok()) {
delete (v);
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options,
size_t num_keys,
const char* const* keys_list,
const size_t* keys_list_sizes,
char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn_db->rep->MultiGet(options->rep, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_transactiondb_multi_get_cf(
rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
std::vector<ColumnFamilyHandle*> cfs(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
cfs[i] = column_families[i]->rep;
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn_db->rep->MultiGet(options->rep, cfs, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
// Put a key inside a transaction
void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key,
size_t klen, const char* val, size_t vlen,
char** errptr) {
SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen)));
}
void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, const char* val,
size_t vlen, char** errptr) {
SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen),
Slice(val, vlen)));
}
void rocksdb_transaction_set_commit_timestamp(rocksdb_transaction_t* txn,
uint64_t commit_timestamp) {
txn->rep->SetCommitTimestamp(commit_timestamp);
}
void rocksdb_transaction_set_read_timestamp_for_validation(
rocksdb_transaction_t* txn, uint64_t read_timestamp) {
txn->rep->SetReadTimestampForValidation(read_timestamp);
}
// Put a key outside a transaction
void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db,
const rocksdb_writeoptions_t* options,
const char* key, size_t klen, const char* val,
size_t vlen, char** errptr) {
SaveError(errptr,
txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen)));
}
void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep,
Slice(key, keylen), Slice(val, vallen)));
}
// Write batch into transaction db
void rocksdb_transactiondb_write(
rocksdb_transactiondb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_writebatch_t* batch,
char** errptr) {
SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
}
// Merge a key inside a transaction
void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key,
size_t klen, const char* val, size_t vlen,
char** errptr) {
SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen)));
}
void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, const char* val,
size_t vlen, char** errptr) {
SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen),
Slice(val, vlen)));
}
// Merge a key outside a transaction
void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db,
const rocksdb_writeoptions_t* options,
const char* key, size_t klen, const char* val,
size_t vlen, char** errptr) {
SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen),
Slice(val, vlen)));
}
void rocksdb_transactiondb_merge_cf(
rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
const char* val, size_t vlen, char** errptr) {
SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep,
Slice(key, klen), Slice(val, vlen)));
}
// Delete a key inside a transaction
void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key,
size_t klen, char** errptr) {
SaveError(errptr, txn->rep->Delete(Slice(key, klen)));
}
void rocksdb_transaction_delete_cf(
rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen, char** errptr) {
SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen)));
}
// Delete a key outside a transaction
void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db,
const rocksdb_writeoptions_t* options,
const char* key, size_t klen, char** errptr) {
SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen)));
}
void rocksdb_transactiondb_delete_cf(
rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
size_t keylen, char** errptr) {
SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep,
Slice(key, keylen)));
}
// Create an iterator inside a transaction
rocksdb_iterator_t* rocksdb_transaction_create_iterator(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = txn->rep->GetIterator(options->rep);
return result;
}
// Create an iterator inside a transaction with column family
rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = txn->rep->GetIterator(options->rep, column_family->rep);
return result;
}
// Create an iterator outside a transaction
rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(
rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = txn_db->rep->NewIterator(options->rep);
return result;
}
rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf(
rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep);
return result;
}
void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) {
delete txn_db->rep;
delete txn_db;
}
void rocksdb_transactiondb_flush_wal(rocksdb_transactiondb_t* txn_db,
unsigned char sync, char** errptr) {
SaveError(errptr, txn_db->rep->FlushWAL(sync));
}
void rocksdb_transactiondb_flush(rocksdb_transactiondb_t* txn_db,
const rocksdb_flushoptions_t* options,
char** errptr) {
SaveError(errptr, txn_db->rep->Flush(options->rep));
}
void rocksdb_transactiondb_flush_cf(
rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
rocksdb_column_family_handle_t* column_family, char** errptr) {
SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep));
}
rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
rocksdb_transactiondb_t* txn_db, char** errptr) {
Checkpoint* checkpoint;
if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) {
return nullptr;
}
rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
result->rep = checkpoint;
return result;
}
rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(
const rocksdb_options_t* options, const char* name, char** errptr) {
OptimisticTransactionDB* otxn_db;
if (SaveError(errptr, OptimisticTransactionDB::Open(
options->rep, std::string(name), &otxn_db))) {
return nullptr;
}
rocksdb_optimistictransactiondb_t* result =
new rocksdb_optimistictransactiondb_t;
result->rep = otxn_db;
return result;
}
rocksdb_optimistictransactiondb_t*
rocksdb_optimistictransactiondb_open_column_families(
const rocksdb_options_t* db_options, const char* name,
int num_column_families, const char* const* column_family_names,
const rocksdb_options_t* const* column_family_options,
rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i < num_column_families; i++) {
column_families.push_back(ColumnFamilyDescriptor(
std::string(column_family_names[i]),
ColumnFamilyOptions(column_family_options[i]->rep)));
}
OptimisticTransactionDB* otxn_db;
std::vector<ColumnFamilyHandle*> handles;
if (SaveError(errptr, OptimisticTransactionDB::Open(
DBOptions(db_options->rep), std::string(name),
column_families, &handles, &otxn_db))) {
return nullptr;
}
for (size_t i = 0; i < handles.size(); i++) {
rocksdb_column_family_handle_t* c_handle =
new rocksdb_column_family_handle_t;
c_handle->rep = handles[i];
column_family_handles[i] = c_handle;
}
rocksdb_optimistictransactiondb_t* result =
new rocksdb_optimistictransactiondb_t;
result->rep = otxn_db;
return result;
}
rocksdb_t* rocksdb_optimistictransactiondb_get_base_db(
rocksdb_optimistictransactiondb_t* otxn_db) {
DB* base_db = otxn_db->rep->GetBaseDB();
if (base_db != nullptr) {
rocksdb_t* result = new rocksdb_t;
result->rep = base_db;
return result;
}
return nullptr;
}
void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) {
delete base_db;
}
rocksdb_transaction_t* rocksdb_optimistictransaction_begin(
rocksdb_optimistictransactiondb_t* otxn_db,
const rocksdb_writeoptions_t* write_options,
const rocksdb_optimistictransaction_options_t* otxn_options,
rocksdb_transaction_t* old_txn) {
if (old_txn == nullptr) {
rocksdb_transaction_t* result = new rocksdb_transaction_t;
result->rep = otxn_db->rep->BeginTransaction(write_options->rep,
otxn_options->rep, nullptr);
return result;
}
old_txn->rep = otxn_db->rep->BeginTransaction(
write_options->rep, otxn_options->rep, old_txn->rep);
return old_txn;
}
// Write batch into OptimisticTransactionDB
void rocksdb_optimistictransactiondb_write(
rocksdb_optimistictransactiondb_t* otxn_db,
const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch,
char** errptr) {
SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep));
}
void rocksdb_optimistictransactiondb_close(
rocksdb_optimistictransactiondb_t* otxn_db) {
delete otxn_db->rep;
delete otxn_db;
}
rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create(
rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) {
Checkpoint* checkpoint;
if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) {
return nullptr;
}
rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
result->rep = checkpoint;
return result;
}
void rocksdb_free(void* ptr) { free(ptr); }
rocksdb_pinnableslice_t* rocksdb_get_pinned(
rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
size_t keylen, char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
Slice(key, keylen), &v->rep);
if (!s.ok()) {
delete (v);
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
rocksdb_t* db, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
size_t keylen, char** errptr) {
rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
&v->rep);
if (!s.ok()) {
delete v;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
return nullptr;
}
return v;
}
void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
size_t* vlen) {
if (!v) {
*vlen = 0;
return nullptr;
}
*vlen = v->rep.size();
return v->rep.data();
}
// container to keep databases and caches in order to use
// ROCKSDB_NAMESPACE::MemoryUtil
struct rocksdb_memory_consumers_t {
std::vector<rocksdb_t*> dbs;
std::unordered_set<rocksdb_cache_t*> caches;
};
// initializes new container of memory consumers
rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
return new rocksdb_memory_consumers_t;
}
// adds datatabase to the container of memory consumers
void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
rocksdb_t* db) {
consumers->dbs.push_back(db);
}
// adds cache to the container of memory consumers
void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
rocksdb_cache_t* cache) {
consumers->caches.insert(cache);
}
// deletes container with memory consumers
void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
delete consumers;
}
// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
struct rocksdb_memory_usage_t {
uint64_t mem_table_total;
uint64_t mem_table_unflushed;
uint64_t mem_table_readers_total;
uint64_t cache_total;
};
// estimates amount of memory occupied by consumers (dbs and caches)
rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
rocksdb_memory_consumers_t* consumers, char** errptr) {
vector<DB*> dbs;
for (auto db : consumers->dbs) {
dbs.push_back(db->rep);
}
unordered_set<const Cache*> cache_set;
for (auto cache : consumers->caches) {
cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
}
std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
&usage_by_type);
if (SaveError(errptr, status)) {
return nullptr;
}
auto result = new rocksdb_memory_usage_t;
result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
result->mem_table_readers_total = usage_by_type[MemoryUtil::kTableReadersTotal];
result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
return result;
}
uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
rocksdb_memory_usage_t* memory_usage) {
return memory_usage->mem_table_total;
}
uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
rocksdb_memory_usage_t* memory_usage) {
return memory_usage->mem_table_unflushed;
}
uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
rocksdb_memory_usage_t* memory_usage) {
return memory_usage->mem_table_readers_total;
}
uint64_t rocksdb_approximate_memory_usage_get_cache_total(
rocksdb_memory_usage_t* memory_usage) {
return memory_usage->cache_total;
}
void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.dump_malloc_stats = val;
}
void rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.memtable_whole_key_filtering = val;
}
// deletes container with memory usage estimates
void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
delete usage;
}
void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) {
CancelAllBackgroundWork(db->rep, wait);
}
void rocksdb_disable_manual_compaction(rocksdb_t* db) {
db->rep->DisableManualCompaction();
}
void rocksdb_enable_manual_compaction(rocksdb_t* db) {
db->rep->EnableManualCompaction();
}
} // end extern "C"
#endif // !ROCKSDB_LITE