Merge branch 'master' into performance

Conflicts:
	db/db_impl.cc
	db/db_test.cc
	db/memtable.cc
	db/version_set.cc
	include/rocksdb/statistics.h
main
kailiu 11 years ago
commit f1cec73a76
  1. 17
      Makefile
  2. 2
      README
  3. 12
      build_tools/build_detect_platform
  4. 475
      db/c.cc
  5. 218
      db/c_test.c
  6. 8
      db/db_bench.cc
  7. 348
      db/db_impl.cc
  8. 95
      db/db_impl.h
  9. 182
      db/db_test.cc
  10. 2
      db/memtable.cc
  11. 10
      db/table_cache.cc
  12. 16
      db/version_set.cc
  13. 8
      db/version_set.h
  14. 296
      include/rocksdb/c.h
  15. 4
      include/rocksdb/db.h
  16. 100
      include/rocksdb/statistics.h
  17. 29
      include/rocksdb/status.h
  18. 2
      include/rocksdb/transaction_log.h
  19. 133
      include/utilities/backupable_db.h
  20. 4
      include/utilities/stackable_db.h
  21. 32
      port/stack_trace.cc
  22. 329
      util/autovector.h
  23. 290
      util/autovector_test.cc
  24. 24
      util/cache.cc
  25. 11
      util/coding.cc
  26. 2
      util/coding.h
  27. 10
      util/env_posix.cc
  28. 12
      util/posix_logger.h
  29. 3
      util/stack_trace.h
  30. 87
      util/status.cc
  31. 2
      util/testharness.h
  32. BIN
      utilities/.DS_Store
  33. 874
      utilities/backupable/backupable_db.cc
  34. 668
      utilities/backupable/backupable_db_test.cc
  35. BIN
      utilities/merge_operators/.DS_Store
  36. 7
      utilities/merge_operators/string_append/stringappend_test.cc
  37. 147
      utilities/ttl/db_ttl.cc
  38. 81
      utilities/ttl/db_ttl.h

@ -45,6 +45,8 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
TESTS = \
autovector_test \
db_test \
table_properties_collector_test \
arena_test \
auto_roll_logger_test \
@ -74,12 +76,12 @@ TESTS = \
skiplist_test \
stringappend_test \
ttl_test \
backupable_db_test \
version_edit_test \
version_set_test \
write_batch_test\
deletefile_test \
table_test \
db_test
table_test
TOOLS = \
sst_dump \
@ -125,7 +127,7 @@ $(SHARED2): $(SHARED3)
endif
$(SHARED3):
$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3)
$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS)
endif # PLATFORM_SHARED_EXT
@ -145,8 +147,9 @@ coverage:
# Delete intermediate files
find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
check: all $(PROGRAMS) $(TESTS) $(TOOLS) ldb_tests
check: all $(PROGRAMS) $(TESTS) $(TOOLS)
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
python tools/ldb_test.py
ldb_tests: all $(PROGRAMS) $(TOOLS)
python tools/ldb_test.py
@ -223,6 +226,9 @@ signal_test: util/signal_test.o $(LIBOBJECTS)
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -277,6 +283,9 @@ perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -79,4 +79,4 @@ include/rocksdb/statistics.h
include/rocksdb/transaction_log.h
An API to retrieve transaction logs from a database.
Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/

@ -189,6 +189,18 @@ EOF
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
fi
# Test whether fallocate is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <fcntl.h>
int main() {
int fd = open("/dev/null", 0);
fallocate(fd, 0, 0, 1024);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$PLATFORM_LDFLAGS -DROCKSDB_FALLOCATE_PRESENT"
fi
# Test whether Snappy library is installed
# http://code.google.com/p/snappy/
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF

@ -20,6 +20,8 @@
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/universal_compaction.h"
using rocksdb::Cache;
using rocksdb::Comparator;
@ -48,21 +50,21 @@ using std::shared_ptr;
extern "C" {
struct leveldb_t { DB* rep; };
struct leveldb_iterator_t { Iterator* rep; };
struct leveldb_writebatch_t { WriteBatch rep; };
struct leveldb_snapshot_t { const Snapshot* rep; };
struct leveldb_readoptions_t { ReadOptions rep; };
struct leveldb_writeoptions_t { WriteOptions rep; };
struct leveldb_options_t { Options rep; };
struct leveldb_seqfile_t { SequentialFile* rep; };
struct leveldb_randomfile_t { RandomAccessFile* rep; };
struct leveldb_writablefile_t { WritableFile* rep; };
struct leveldb_filelock_t { FileLock* rep; };
struct leveldb_logger_t { shared_ptr<Logger> rep; };
struct leveldb_cache_t { shared_ptr<Cache> rep; };
struct leveldb_comparator_t : public Comparator {
struct rocksdb_t { DB* rep; };
struct rocksdb_iterator_t { Iterator* rep; };
struct rocksdb_writebatch_t { WriteBatch rep; };
struct rocksdb_snapshot_t { const Snapshot* rep; };
struct rocksdb_readoptions_t { ReadOptions rep; };
struct rocksdb_writeoptions_t { WriteOptions rep; };
struct rocksdb_options_t { Options rep; };
struct rocksdb_seqfile_t { SequentialFile* rep; };
struct rocksdb_randomfile_t { RandomAccessFile* rep; };
struct rocksdb_writablefile_t { WritableFile* rep; };
struct rocksdb_filelock_t { FileLock* rep; };
struct rocksdb_logger_t { shared_ptr<Logger> rep; };
struct rocksdb_cache_t { shared_ptr<Cache> rep; };
struct rocksdb_comparator_t : public Comparator {
void* state_;
void (*destructor_)(void*);
int (*compare_)(
@ -71,7 +73,7 @@ struct leveldb_comparator_t : public Comparator {
const char* b, size_t blen);
const char* (*name_)(void*);
virtual ~leveldb_comparator_t() {
virtual ~rocksdb_comparator_t() {
(*destructor_)(state_);
}
@ -88,7 +90,7 @@ struct leveldb_comparator_t : public Comparator {
virtual void FindShortSuccessor(std::string* key) const { }
};
struct leveldb_filterpolicy_t : public FilterPolicy {
struct rocksdb_filterpolicy_t : public FilterPolicy {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
@ -102,7 +104,7 @@ struct leveldb_filterpolicy_t : public FilterPolicy {
const char* key, size_t length,
const char* filter, size_t filter_length);
virtual ~leveldb_filterpolicy_t() {
virtual ~rocksdb_filterpolicy_t() {
(*destructor_)(state_);
}
@ -129,11 +131,16 @@ struct leveldb_filterpolicy_t : public FilterPolicy {
}
};
struct leveldb_env_t {
struct rocksdb_env_t {
Env* rep;
bool is_default;
};
struct rocksdb_universal_compaction_options_t {
rocksdb::CompactionOptionsUniversal *rep;
};
static bool SaveError(char** errptr, const Status& s) {
assert(errptr != NULL);
if (s.ok()) {
@ -154,27 +161,27 @@ static char* CopyString(const std::string& str) {
return result;
}
leveldb_t* leveldb_open(
const leveldb_options_t* options,
rocksdb_t* rocksdb_open(
const rocksdb_options_t* options,
const char* name,
char** errptr) {
DB* db;
if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
return NULL;
}
leveldb_t* result = new leveldb_t;
rocksdb_t* result = new rocksdb_t;
result->rep = db;
return result;
}
void leveldb_close(leveldb_t* db) {
void rocksdb_close(rocksdb_t* db) {
delete db->rep;
delete db;
}
void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
void rocksdb_put(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
@ -182,26 +189,26 @@ void leveldb_put(
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
}
void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
void rocksdb_delete(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
}
void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
void rocksdb_write(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_writebatch_t* batch,
char** errptr) {
SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
}
char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
char* rocksdb_get(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
@ -220,30 +227,30 @@ char* leveldb_get(
return result;
}
leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options) {
leveldb_iterator_t* result = new leveldb_iterator_t;
rocksdb_iterator_t* rocksdb_create_iterator(
rocksdb_t* db,
const rocksdb_readoptions_t* options) {
rocksdb_iterator_t* result = new rocksdb_iterator_t;
result->rep = db->rep->NewIterator(options->rep);
return result;
}
const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db) {
leveldb_snapshot_t* result = new leveldb_snapshot_t;
const rocksdb_snapshot_t* rocksdb_create_snapshot(
rocksdb_t* db) {
rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
result->rep = db->rep->GetSnapshot();
return result;
}
void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot) {
void rocksdb_release_snapshot(
rocksdb_t* db,
const rocksdb_snapshot_t* snapshot) {
db->rep->ReleaseSnapshot(snapshot->rep);
delete snapshot;
}
char* leveldb_property_value(
leveldb_t* db,
char* rocksdb_property_value(
rocksdb_t* db,
const char* propname) {
std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) {
@ -254,8 +261,8 @@ char* leveldb_property_value(
}
}
void leveldb_approximate_sizes(
leveldb_t* db,
void rocksdb_approximate_sizes(
rocksdb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
@ -269,8 +276,8 @@ void leveldb_approximate_sizes(
delete[] ranges;
}
void leveldb_compact_range(
leveldb_t* db,
void rocksdb_compact_range(
rocksdb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
@ -280,92 +287,92 @@ void leveldb_compact_range(
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
}
void leveldb_destroy_db(
const leveldb_options_t* options,
void rocksdb_destroy_db(
const rocksdb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, DestroyDB(name, options->rep));
}
void leveldb_repair_db(
const leveldb_options_t* options,
void rocksdb_repair_db(
const rocksdb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, RepairDB(name, options->rep));
}
void leveldb_iter_destroy(leveldb_iterator_t* iter) {
void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
delete iter->rep;
delete iter;
}
unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
return iter->rep->Valid();
}
void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
iter->rep->SeekToFirst();
}
void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
iter->rep->SeekToLast();
}
void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
iter->rep->Seek(Slice(k, klen));
}
void leveldb_iter_next(leveldb_iterator_t* iter) {
void rocksdb_iter_next(rocksdb_iterator_t* iter) {
iter->rep->Next();
}
void leveldb_iter_prev(leveldb_iterator_t* iter) {
void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
iter->rep->Prev();
}
const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
Slice s = iter->rep->key();
*klen = s.size();
return s.data();
}
const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
Slice s = iter->rep->value();
*vlen = s.size();
return s.data();
}
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status());
}
leveldb_writebatch_t* leveldb_writebatch_create() {
return new leveldb_writebatch_t;
rocksdb_writebatch_t* rocksdb_writebatch_create() {
return new rocksdb_writebatch_t;
}
void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
delete b;
}
void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) {
b->rep.Clear();
}
void leveldb_writebatch_put(
leveldb_writebatch_t* b,
void rocksdb_writebatch_put(
rocksdb_writebatch_t* b,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep.Put(Slice(key, klen), Slice(val, vlen));
}
void leveldb_writebatch_delete(
leveldb_writebatch_t* b,
void rocksdb_writebatch_delete(
rocksdb_writebatch_t* b,
const char* key, size_t klen) {
b->rep.Delete(Slice(key, klen));
}
void leveldb_writebatch_iterate(
leveldb_writebatch_t* b,
void rocksdb_writebatch_iterate(
rocksdb_writebatch_t* b,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen)) {
@ -388,132 +395,132 @@ void leveldb_writebatch_iterate(
b->rep.Iterate(&handler);
}
leveldb_options_t* leveldb_options_create() {
return new leveldb_options_t;
rocksdb_options_t* rocksdb_options_create() {
return new rocksdb_options_t;
}
void leveldb_options_destroy(leveldb_options_t* options) {
void rocksdb_options_destroy(rocksdb_options_t* options) {
delete options;
}
void leveldb_options_set_comparator(
leveldb_options_t* opt,
leveldb_comparator_t* cmp) {
void rocksdb_options_set_comparator(
rocksdb_options_t* opt,
rocksdb_comparator_t* cmp) {
opt->rep.comparator = cmp;
}
void leveldb_options_set_filter_policy(
leveldb_options_t* opt,
leveldb_filterpolicy_t* policy) {
void rocksdb_options_set_filter_policy(
rocksdb_options_t* opt,
rocksdb_filterpolicy_t* policy) {
opt->rep.filter_policy = policy;
}
void leveldb_options_set_create_if_missing(
leveldb_options_t* opt, unsigned char v) {
void rocksdb_options_set_create_if_missing(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.create_if_missing = v;
}
void leveldb_options_set_error_if_exists(
leveldb_options_t* opt, unsigned char v) {
void rocksdb_options_set_error_if_exists(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.error_if_exists = v;
}
void leveldb_options_set_paranoid_checks(
leveldb_options_t* opt, unsigned char v) {
void rocksdb_options_set_paranoid_checks(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.paranoid_checks = v;
}
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
opt->rep.env = (env ? env->rep : NULL);
}
void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
if (l) {
opt->rep.info_log = l->rep;
}
}
void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s;
}
void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
opt->rep.max_open_files = n;
}
void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
if (c) {
opt->rep.block_cache = c->rep;
}
}
void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
opt->rep.block_size = s;
}
void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
opt->rep.block_restart_interval = n;
}
void leveldb_options_set_target_file_size_base(
leveldb_options_t* opt, uint64_t n) {
void rocksdb_options_set_target_file_size_base(
rocksdb_options_t* opt, uint64_t n) {
opt->rep.target_file_size_base = n;
}
void leveldb_options_set_target_file_size_multiplier(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_target_file_size_multiplier(
rocksdb_options_t* opt, int n) {
opt->rep.target_file_size_multiplier = n;
}
void leveldb_options_set_max_bytes_for_level_base(
leveldb_options_t* opt, uint64_t n) {
void rocksdb_options_set_max_bytes_for_level_base(
rocksdb_options_t* opt, uint64_t n) {
opt->rep.max_bytes_for_level_base = n;
}
void leveldb_options_set_max_bytes_for_level_multiplier(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_max_bytes_for_level_multiplier(
rocksdb_options_t* opt, int n) {
opt->rep.max_bytes_for_level_multiplier = n;
}
void leveldb_options_set_expanded_compaction_factor(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_expanded_compaction_factor(
rocksdb_options_t* opt, int n) {
opt->rep.expanded_compaction_factor = n;
}
void leveldb_options_set_max_grandparent_overlap_factor(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_max_grandparent_overlap_factor(
rocksdb_options_t* opt, int n) {
opt->rep.max_grandparent_overlap_factor = n;
}
void leveldb_options_set_num_levels(leveldb_options_t* opt, int n) {
void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
opt->rep.num_levels = n;
}
void leveldb_options_set_level0_file_num_compaction_trigger(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_level0_file_num_compaction_trigger(
rocksdb_options_t* opt, int n) {
opt->rep.level0_file_num_compaction_trigger = n;
}
void leveldb_options_set_level0_slowdown_writes_trigger(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_level0_slowdown_writes_trigger(
rocksdb_options_t* opt, int n) {
opt->rep.level0_slowdown_writes_trigger = n;
}
void leveldb_options_set_level0_stop_writes_trigger(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_level0_stop_writes_trigger(
rocksdb_options_t* opt, int n) {
opt->rep.level0_stop_writes_trigger = n;
}
void leveldb_options_set_max_mem_compaction_level(
leveldb_options_t* opt, int n) {
void rocksdb_options_set_max_mem_compaction_level(
rocksdb_options_t* opt, int n) {
opt->rep.max_mem_compaction_level = n;
}
void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
opt->rep.compression = static_cast<CompressionType>(t);
}
void leveldb_options_set_compression_per_level(leveldb_options_t* opt,
void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
int* level_values,
size_t num_levels) {
opt->rep.compression_per_level.resize(num_levels);
@ -523,43 +530,132 @@ void leveldb_options_set_compression_per_level(leveldb_options_t* opt,
}
}
void leveldb_options_set_compression_options(
leveldb_options_t* opt, int w_bits, int level, int strategy) {
void rocksdb_options_set_compression_options(
rocksdb_options_t* opt, int w_bits, int level, int strategy) {
opt->rep.compression_opts.window_bits = w_bits;
opt->rep.compression_opts.level = level;
opt->rep.compression_opts.strategy = strategy;
}
void leveldb_options_set_disable_data_sync(
leveldb_options_t* opt, bool disable_data_sync) {
void rocksdb_options_set_disable_data_sync(
rocksdb_options_t* opt, int disable_data_sync) {
opt->rep.disableDataSync = disable_data_sync;
}
void leveldb_options_set_use_fsync(
leveldb_options_t* opt, bool use_fsync) {
void rocksdb_options_set_use_fsync(
rocksdb_options_t* opt, int use_fsync) {
opt->rep.use_fsync = use_fsync;
}
void leveldb_options_set_db_stats_log_interval(
leveldb_options_t* opt, int db_stats_log_interval) {
void rocksdb_options_set_db_stats_log_interval(
rocksdb_options_t* opt, int db_stats_log_interval) {
opt->rep.db_stats_log_interval = db_stats_log_interval;
}
void leveldb_options_set_db_log_dir(
leveldb_options_t* opt, const char* db_log_dir) {
void rocksdb_options_set_db_log_dir(
rocksdb_options_t* opt, const char* db_log_dir) {
opt->rep.db_log_dir = db_log_dir;
}
void leveldb_options_set_WAL_ttl_seconds(leveldb_options_t* opt, uint64_t ttl) {
void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
opt->rep.WAL_ttl_seconds = ttl;
}
void leveldb_options_set_WAL_size_limit_MB(
leveldb_options_t* opt, uint64_t limit) {
void rocksdb_options_set_WAL_size_limit_MB(
rocksdb_options_t* opt, uint64_t limit) {
opt->rep.WAL_size_limit_MB = limit;
}
leveldb_comparator_t* leveldb_comparator_create(
void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
opt->rep.max_write_buffer_number = n;
}
void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
opt->rep.min_write_buffer_number_to_merge = n;
}
void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
opt->rep.max_background_compactions = n;
}
void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
opt->rep.max_background_flushes = n;
}
void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
opt->rep.disable_auto_compactions = disable;
}
void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
opt->rep.disable_seek_compaction = disable;
}
void rocksdb_options_set_source_compaction_factor(
rocksdb_options_t* opt, int n) {
opt->rep.expanded_compaction_factor = n;
}
void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
opt->rep.PrepareForBulkLoad();
}
void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
static rocksdb::VectorRepFactory* factory = 0;
if (!factory) {
factory = new rocksdb::VectorRepFactory;
}
opt->rep.memtable_factory.reset(factory);
}
void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style);
}
void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) {
opt->rep.compaction_options_universal = *(uco->rep);
}
/*
TODO:
merge_operator
compaction_filter
prefix_extractor
whole_key_filtering
max_bytes_for_level_multiplier_additional
delete_obsolete_files_period_micros
max_log_file_size
log_file_time_to_roll
keep_log_file_num
soft_rate_limit
hard_rate_limit
rate_limit_delay_max_milliseconds
max_manifest_file_size
no_block_cache
table_cache_numshardbits
table_cache_remove_scan_count_limit
arena_block_size
manifest_preallocation_size
purge_redundant_kvs_while_flush
allow_os_buffer
allow_mmap_reads
allow_mmap_writes
is_fd_close_on_exec
skip_log_error_on_recovery
stats_dump_period_sec
block_size_deviation
advise_random_on_open
access_hint_on_compaction_start
use_adaptive_mutex
bytes_per_sync
filter_deletes
max_sequential_skip_in_iterations
table_factory
table_properties_collectors
inplace_update_support
inplace_update_num_locks
*/
rocksdb_comparator_t* rocksdb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
@ -567,7 +663,7 @@ leveldb_comparator_t* leveldb_comparator_create(
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*)) {
leveldb_comparator_t* result = new leveldb_comparator_t;
rocksdb_comparator_t* result = new rocksdb_comparator_t;
result->state_ = state;
result->destructor_ = destructor;
result->compare_ = compare;
@ -575,11 +671,11 @@ leveldb_comparator_t* leveldb_comparator_create(
return result;
}
void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) {
delete cmp;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create(
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
@ -592,7 +688,7 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create(
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*)) {
leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t;
result->state_ = state;
result->destructor_ = destructor;
result->create_ = create_filter;
@ -601,15 +697,15 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create(
return result;
}
void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
delete filter;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
// Make a leveldb_filterpolicy_t, but override all of its methods so
rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) {
// Make a rocksdb_filterpolicy_t, but override all of its methods so
// they delegate to a NewBloomFilterPolicy() instead of user
// supplied C functions.
struct Wrapper : public leveldb_filterpolicy_t {
struct Wrapper : public rocksdb_filterpolicy_t {
const FilterPolicy* rep_;
~Wrapper() { delete rep_; }
const char* Name() const { return rep_->Name(); }
@ -628,64 +724,115 @@ leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
return wrapper;
}
leveldb_readoptions_t* leveldb_readoptions_create() {
return new leveldb_readoptions_t;
rocksdb_readoptions_t* rocksdb_readoptions_create() {
return new rocksdb_readoptions_t;
}
void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) {
delete opt;
}
void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t* opt,
void rocksdb_readoptions_set_verify_checksums(
rocksdb_readoptions_t* opt,
unsigned char v) {
opt->rep.verify_checksums = v;
}
void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t* opt, unsigned char v) {
void rocksdb_readoptions_set_fill_cache(
rocksdb_readoptions_t* opt, unsigned char v) {
opt->rep.fill_cache = v;
}
void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t* opt,
const leveldb_snapshot_t* snap) {
void rocksdb_readoptions_set_snapshot(
rocksdb_readoptions_t* opt,
const rocksdb_snapshot_t* snap) {
opt->rep.snapshot = (snap ? snap->rep : NULL);
}
leveldb_writeoptions_t* leveldb_writeoptions_create() {
return new leveldb_writeoptions_t;
rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
return new rocksdb_writeoptions_t;
}
void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
delete opt;
}
void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t* opt, unsigned char v) {
void rocksdb_writeoptions_set_sync(
rocksdb_writeoptions_t* opt, unsigned char v) {
opt->rep.sync = v;
}
leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
leveldb_cache_t* c = new leveldb_cache_t;
void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) {
opt->rep.disableWAL = disable;
}
rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = NewLRUCache(capacity);
return c;
}
void leveldb_cache_destroy(leveldb_cache_t* cache) {
void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
delete cache;
}
leveldb_env_t* leveldb_create_default_env() {
leveldb_env_t* result = new leveldb_env_t;
rocksdb_env_t* rocksdb_create_default_env() {
rocksdb_env_t* result = new rocksdb_env_t;
result->rep = Env::Default();
result->is_default = true;
return result;
}
void leveldb_env_destroy(leveldb_env_t* env) {
void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
env->rep->SetBackgroundThreads(n);
}
void rocksdb_env_destroy(rocksdb_env_t* env) {
if (!env->is_default) delete env->rep;
delete env;
}
rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() {
rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t;
result->rep = new rocksdb::CompactionOptionsUniversal;
return result;
}
void rocksdb_universal_compaction_options_set_size_ratio(
rocksdb_universal_compaction_options_t* uco, int ratio) {
uco->rep->size_ratio = ratio;
}
void rocksdb_universal_compaction_options_set_min_merge_width(
rocksdb_universal_compaction_options_t* uco, int w) {
uco->rep->min_merge_width = w;
}
void rocksdb_universal_compaction_options_set_max_merge_width(
rocksdb_universal_compaction_options_t* uco, int w) {
uco->rep->max_merge_width = w;
}
void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
rocksdb_universal_compaction_options_t* uco, int p) {
uco->rep->max_size_amplification_percent = p;
}
void rocksdb_universal_compaction_options_set_compression_size_percent(
rocksdb_universal_compaction_options_t* uco, int p) {
uco->rep->compression_size_percent = p;
}
void rocksdb_universal_compaction_options_set_stop_style(
rocksdb_universal_compaction_options_t* uco, int style) {
uco->rep->stop_style = static_cast<rocksdb::CompactionStopStyle>(style);
}
void rocksdb_universal_compaction_options_destroy(
rocksdb_universal_compaction_options_t* uco) {
delete uco->rep;
delete uco;
}
} // end extern "C"

@ -62,30 +62,30 @@ static void Free(char** ptr) {
}
static void CheckGet(
leveldb_t* db,
const leveldb_readoptions_t* options,
rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key,
const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckIter(leveldb_iterator_t* iter,
static void CheckIter(rocksdb_iterator_t* iter,
const char* key, const char* val) {
size_t len;
const char* str;
str = leveldb_iter_key(iter, &len);
str = rocksdb_iter_key(iter, &len);
CheckEqual(key, str, len);
str = leveldb_iter_value(iter, &len);
str = rocksdb_iter_value(iter, &len);
CheckEqual(val, str, len);
}
// Callback from leveldb_writebatch_iterate()
// Callback from rocksdb_writebatch_iterate()
static void CheckPut(void* ptr,
const char* k, size_t klen,
const char* v, size_t vlen) {
@ -104,7 +104,7 @@ static void CheckPut(void* ptr,
(*state)++;
}
// Callback from leveldb_writebatch_iterate()
// Callback from rocksdb_writebatch_iterate()
static void CheckDel(void* ptr, const char* k, size_t klen) {
int* state = (int*) ptr;
CheckCondition(*state == 2);
@ -155,117 +155,117 @@ unsigned char FilterKeyMatch(
}
int main(int argc, char** argv) {
leveldb_t* db;
leveldb_comparator_t* cmp;
leveldb_cache_t* cache;
leveldb_env_t* env;
leveldb_options_t* options;
leveldb_readoptions_t* roptions;
leveldb_writeoptions_t* woptions;
rocksdb_t* db;
rocksdb_comparator_t* cmp;
rocksdb_cache_t* cache;
rocksdb_env_t* env;
rocksdb_options_t* options;
rocksdb_readoptions_t* roptions;
rocksdb_writeoptions_t* woptions;
char* err = NULL;
int run = -1;
snprintf(dbname, sizeof(dbname),
"%s/leveldb_c_test-%d",
"%s/rocksdb_c_test-%d",
GetTempDir(),
((int) geteuid()));
StartPhase("create_objects");
cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
env = leveldb_create_default_env();
cache = leveldb_cache_create_lru(100000);
options = leveldb_options_create();
leveldb_options_set_comparator(options, cmp);
leveldb_options_set_error_if_exists(options, 1);
leveldb_options_set_cache(options, cache);
leveldb_options_set_env(options, env);
leveldb_options_set_info_log(options, NULL);
leveldb_options_set_write_buffer_size(options, 100000);
leveldb_options_set_paranoid_checks(options, 1);
leveldb_options_set_max_open_files(options, 10);
leveldb_options_set_block_size(options, 1024);
leveldb_options_set_block_restart_interval(options, 8);
leveldb_options_set_compression(options, leveldb_no_compression);
leveldb_options_set_compression_options(options, -14, -1, 0);
int compression_levels[] = {leveldb_no_compression, leveldb_no_compression,
leveldb_no_compression, leveldb_no_compression};
leveldb_options_set_compression_per_level(options, compression_levels, 4);
roptions = leveldb_readoptions_create();
leveldb_readoptions_set_verify_checksums(roptions, 1);
leveldb_readoptions_set_fill_cache(roptions, 0);
woptions = leveldb_writeoptions_create();
leveldb_writeoptions_set_sync(woptions, 1);
cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
env = rocksdb_create_default_env();
cache = rocksdb_cache_create_lru(100000);
options = rocksdb_options_create();
rocksdb_options_set_comparator(options, cmp);
rocksdb_options_set_error_if_exists(options, 1);
rocksdb_options_set_cache(options, cache);
rocksdb_options_set_env(options, env);
rocksdb_options_set_info_log(options, NULL);
rocksdb_options_set_write_buffer_size(options, 100000);
rocksdb_options_set_paranoid_checks(options, 1);
rocksdb_options_set_max_open_files(options, 10);
rocksdb_options_set_block_size(options, 1024);
rocksdb_options_set_block_restart_interval(options, 8);
rocksdb_options_set_compression(options, rocksdb_no_compression);
rocksdb_options_set_compression_options(options, -14, -1, 0);
int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
rocksdb_no_compression, rocksdb_no_compression};
rocksdb_options_set_compression_per_level(options, compression_levels, 4);
roptions = rocksdb_readoptions_create();
rocksdb_readoptions_set_verify_checksums(roptions, 1);
rocksdb_readoptions_set_fill_cache(roptions, 0);
woptions = rocksdb_writeoptions_create();
rocksdb_writeoptions_set_sync(woptions, 1);
StartPhase("destroy");
leveldb_destroy_db(options, dbname, &err);
rocksdb_destroy_db(options, dbname, &err);
Free(&err);
StartPhase("open_error");
db = leveldb_open(options, dbname, &err);
db = rocksdb_open(options, dbname, &err);
CheckCondition(err != NULL);
Free(&err);
StartPhase("open");
leveldb_options_set_create_if_missing(options, 1);
db = leveldb_open(options, dbname, &err);
rocksdb_options_set_create_if_missing(options, 1);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
StartPhase("put");
leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactall");
leveldb_compact_range(db, NULL, 0, NULL, 0);
rocksdb_compact_range(db, NULL, 0, NULL, 0);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactrange");
leveldb_compact_range(db, "a", 1, "z", 1);
rocksdb_compact_range(db, "a", 1, "z", 1);
CheckGet(db, roptions, "foo", "hello");
StartPhase("writebatch");
{
leveldb_writebatch_t* wb = leveldb_writebatch_create();
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
leveldb_writebatch_clear(wb);
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
leveldb_writebatch_put(wb, "box", 3, "c", 1);
leveldb_writebatch_delete(wb, "bar", 3);
leveldb_write(db, woptions, wb, &err);
rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
rocksdb_writebatch_clear(wb);
rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
rocksdb_writebatch_put(wb, "box", 3, "c", 1);
rocksdb_writebatch_delete(wb, "bar", 3);
rocksdb_write(db, woptions, wb, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
int pos = 0;
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
CheckCondition(pos == 3);
leveldb_writebatch_destroy(wb);
rocksdb_writebatch_destroy(wb);
}
StartPhase("iter");
{
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_first(iter);
CheckCondition(leveldb_iter_valid(iter));
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
CheckCondition(!rocksdb_iter_valid(iter));
rocksdb_iter_seek_to_first(iter);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "box", "c");
leveldb_iter_next(iter);
rocksdb_iter_next(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_prev(iter);
rocksdb_iter_prev(iter);
CheckIter(iter, "box", "c");
leveldb_iter_prev(iter);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_last(iter);
rocksdb_iter_prev(iter);
CheckCondition(!rocksdb_iter_valid(iter));
rocksdb_iter_seek_to_last(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_seek(iter, "b", 1);
rocksdb_iter_seek(iter, "b", 1);
CheckIter(iter, "box", "c");
leveldb_iter_get_error(iter, &err);
rocksdb_iter_get_error(iter, &err);
CheckNoError(err);
leveldb_iter_destroy(iter);
rocksdb_iter_destroy(iter);
}
StartPhase("approximate_sizes");
@ -279,39 +279,39 @@ int main(int argc, char** argv) {
size_t start_len[2] = { 1, 21 };
const char* limit[2] = { "k00000000000000010000", "z" };
size_t limit_len[2] = { 21, 1 };
leveldb_writeoptions_set_sync(woptions, 0);
rocksdb_writeoptions_set_sync(woptions, 0);
for (i = 0; i < n; i++) {
snprintf(keybuf, sizeof(keybuf), "k%020d", i);
snprintf(valbuf, sizeof(valbuf), "v%020d", i);
leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
&err);
CheckNoError(err);
}
leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
CheckCondition(sizes[0] > 0);
CheckCondition(sizes[1] > 0);
}
StartPhase("property");
{
char* prop = leveldb_property_value(db, "nosuchprop");
char* prop = rocksdb_property_value(db, "nosuchprop");
CheckCondition(prop == NULL);
prop = leveldb_property_value(db, "rocksdb.stats");
prop = rocksdb_property_value(db, "rocksdb.stats");
CheckCondition(prop != NULL);
Free(&prop);
}
StartPhase("snapshot");
{
const leveldb_snapshot_t* snap;
snap = leveldb_create_snapshot(db);
leveldb_delete(db, woptions, "foo", 3, &err);
const rocksdb_snapshot_t* snap;
snap = rocksdb_create_snapshot(db);
rocksdb_delete(db, woptions, "foo", 3, &err);
CheckNoError(err);
leveldb_readoptions_set_snapshot(roptions, snap);
rocksdb_readoptions_set_snapshot(roptions, snap);
CheckGet(db, roptions, "foo", "hello");
leveldb_readoptions_set_snapshot(roptions, NULL);
rocksdb_readoptions_set_snapshot(roptions, NULL);
CheckGet(db, roptions, "foo", NULL);
leveldb_release_snapshot(db, snap);
rocksdb_release_snapshot(db, snap);
}
StartPhase("repair");
@ -320,44 +320,44 @@ int main(int argc, char** argv) {
// files (https://reviews.facebook.net/D6123) would leave
// around deleted files and the repair process will find
// those files and put them back into the database.
leveldb_compact_range(db, NULL, 0, NULL, 0);
leveldb_close(db);
leveldb_options_set_create_if_missing(options, 0);
leveldb_options_set_error_if_exists(options, 0);
leveldb_repair_db(options, dbname, &err);
rocksdb_compact_range(db, NULL, 0, NULL, 0);
rocksdb_close(db);
rocksdb_options_set_create_if_missing(options, 0);
rocksdb_options_set_error_if_exists(options, 0);
rocksdb_repair_db(options, dbname, &err);
CheckNoError(err);
db = leveldb_open(options, dbname, &err);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
leveldb_options_set_create_if_missing(options, 1);
leveldb_options_set_error_if_exists(options, 1);
rocksdb_options_set_create_if_missing(options, 1);
rocksdb_options_set_error_if_exists(options, 1);
}
StartPhase("filter");
for (run = 0; run < 2; run++) {
// First run uses custom filter, second run uses bloom filter
CheckNoError(err);
leveldb_filterpolicy_t* policy;
rocksdb_filterpolicy_t* policy;
if (run == 0) {
policy = leveldb_filterpolicy_create(
policy = rocksdb_filterpolicy_create(
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
} else {
policy = leveldb_filterpolicy_create_bloom(10);
policy = rocksdb_filterpolicy_create_bloom(10);
}
// Create new database
leveldb_close(db);
leveldb_destroy_db(options, dbname, &err);
leveldb_options_set_filter_policy(options, policy);
db = leveldb_open(options, dbname, &err);
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
rocksdb_options_set_filter_policy(options, policy);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
CheckNoError(err);
leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
CheckNoError(err);
leveldb_compact_range(db, NULL, 0, NULL, 0);
rocksdb_compact_range(db, NULL, 0, NULL, 0);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
@ -372,18 +372,18 @@ int main(int argc, char** argv) {
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
}
leveldb_options_set_filter_policy(options, NULL);
leveldb_filterpolicy_destroy(policy);
rocksdb_options_set_filter_policy(options, NULL);
rocksdb_filterpolicy_destroy(policy);
}
StartPhase("cleanup");
leveldb_close(db);
leveldb_options_destroy(options);
leveldb_readoptions_destroy(roptions);
leveldb_writeoptions_destroy(woptions);
leveldb_cache_destroy(cache);
leveldb_comparator_destroy(cmp);
leveldb_env_destroy(env);
rocksdb_close(db);
rocksdb_options_destroy(options);
rocksdb_readoptions_destroy(roptions);
rocksdb_writeoptions_destroy(woptions);
rocksdb_cache_destroy(cache);
rocksdb_comparator_destroy(cmp);
rocksdb_env_destroy(env);
fprintf(stderr, "PASS\n");
return 0;

@ -49,6 +49,7 @@ DEFINE_string(benchmarks,
"compact,"
"readrandom,"
"readseq,"
"readtocache,"
"readreverse,"
"readwhilewriting,"
"readrandomwriterandom,"
@ -76,6 +77,7 @@ DEFINE_string(benchmarks,
"\tdeleteseq -- delete N keys in sequential order\n"
"\tdeleterandom -- delete N keys in random order\n"
"\treadseq -- read N times sequentially\n"
"\treadtocache -- 1 thread reading database sequentially\n"
"\treadreverse -- read N times in reverse order\n"
"\treadrandom -- read N times in random order\n"
"\treadmissing -- read N missing keys in random order\n"
@ -150,7 +152,7 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
DEFINE_bool(histogram, false, "Print histogram of operation timings");
DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
"Number of bytes to buffer in memtable before compacting");
DEFINE_int32(max_write_buffer_number,
@ -1062,6 +1064,10 @@ class Benchmark {
method = &Benchmark::WriteRandom;
} else if (name == Slice("readseq")) {
method = &Benchmark::ReadSequential;
} else if (name == Slice("readtocache")) {
method = &Benchmark::ReadSequential;
num_threads = 1;
reads_ = num_;
} else if (name == Slice("readreverse")) {
method = &Benchmark::ReadReverse;
} else if (name == Slice("readrandom")) {

@ -241,6 +241,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
mem_(new MemTable(internal_comparator_, mem_rep_factory_,
NumberLevels(), options_)),
logfile_number_(0),
super_version_(nullptr),
tmp_batch_(),
bg_compaction_scheduled_(0),
bg_flush_scheduled_(0),
@ -316,6 +317,13 @@ DBImpl::~DBImpl() {
bg_logstats_scheduled_) {
bg_cv_.Wait();
}
if (super_version_ != nullptr) {
bool is_last_reference __attribute__((unused));
is_last_reference = super_version_->Unref();
assert(is_last_reference);
super_version_->Cleanup();
delete super_version_;
}
mutex_.Unlock();
if (db_lock_ != nullptr) {
@ -345,6 +353,13 @@ void DBImpl::TEST_Destroy_DBImpl() {
bg_logstats_scheduled_) {
bg_cv_.Wait();
}
if (super_version_ != nullptr) {
bool is_last_reference __attribute__((unused));
is_last_reference = super_version_->Unref();
assert(is_last_reference);
super_version_->Cleanup();
delete super_version_;
}
// Prevent new compactions from occuring.
bg_work_gate_closed_ = true;
@ -443,6 +458,49 @@ void DBImpl::MaybeDumpStats() {
}
}
// DBImpl::SuperVersion methods
DBImpl::SuperVersion::SuperVersion(const int num_memtables) {
to_delete.resize(num_memtables);
}
DBImpl::SuperVersion::~SuperVersion() {
for (auto td : to_delete) {
delete td;
}
}
DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() {
refs.fetch_add(1, std::memory_order_relaxed);
return this;
}
bool DBImpl::SuperVersion::Unref() {
assert(refs > 0);
// fetch_sub returns the previous value of ref
return refs.fetch_sub(1, std::memory_order_relaxed) == 1;
}
void DBImpl::SuperVersion::Cleanup() {
assert(refs.load(std::memory_order_relaxed) == 0);
imm.UnrefAll(&to_delete);
MemTable* m = mem->Unref();
if (m != nullptr) {
to_delete.push_back(m);
}
current->Unref();
}
void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm,
Version* new_current) {
mem = new_mem;
imm = new_imm;
current = new_current;
mem->Ref();
imm.RefAll();
current->Ref();
refs.store(1, std::memory_order_relaxed);
}
// Returns the list of live files in 'sst_live' and the list
// of all files in the filesystem in 'all_files'.
// no_full_scan = true -- never do the full scan using GetChildren()
@ -518,11 +576,6 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
// It is not necessary to hold the mutex when invoking this method.
void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
// free pending memtables
for (auto m : state.memtables_to_free) {
delete m;
}
// check if there is anything to do
if (!state.all_files.size() &&
!state.sst_delete_files.size() &&
@ -1041,6 +1094,7 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) {
stats.bytes_written = meta.file_size;
stats.files_out_levelnp1 = 1;
stats_[level].Add(stats);
RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
return s;
}
@ -1129,6 +1183,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
stats.micros = env_->NowMicros() - start_micros;
stats.bytes_written = meta.file_size;
stats_[level].Add(stats);
RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
return s;
}
@ -1186,6 +1241,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
file_number, pending_outputs_, &deletion_state.memtables_to_free);
if (s.ok()) {
InstallSuperVersion(deletion_state);
if (madeProgress) {
*madeProgress = 1;
}
@ -1245,11 +1301,17 @@ int DBImpl::FindMinimumEmptyLevelFitting(int level) {
void DBImpl::ReFitLevel(int level, int target_level) {
assert(level < NumberLevels());
MutexLock l(&mutex_);
SuperVersion* superversion_to_free = nullptr;
SuperVersion* new_superversion =
new SuperVersion(options_.max_write_buffer_number);
mutex_.Lock();
// only allow one thread refitting
if (refitting_level_) {
mutex_.Unlock();
Log(options_.info_log, "ReFitLevel: another thread is refitting");
delete new_superversion;
return;
}
refitting_level_ = true;
@ -1285,6 +1347,8 @@ void DBImpl::ReFitLevel(int level, int target_level) {
edit.DebugString().data());
auto status = versions_->LogAndApply(&edit, &mutex_);
superversion_to_free = InstallSuperVersion(new_superversion);
new_superversion = nullptr;
Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data());
@ -1296,6 +1360,10 @@ void DBImpl::ReFitLevel(int level, int target_level) {
refitting_level_ = false;
bg_work_gate_closed_ = false;
mutex_.Unlock();
delete superversion_to_free;
delete new_superversion;
}
int DBImpl::NumberLevels() {
@ -1311,8 +1379,7 @@ int DBImpl::Level0StopWriteTrigger() {
}
Status DBImpl::Flush(const FlushOptions& options) {
Status status = FlushMemTable(options);
return status;
return FlushMemTable(options);
}
SequenceNumber DBImpl::GetLatestSequenceNumber() const {
@ -1669,7 +1736,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress,
void DBImpl::BackgroundCallFlush() {
bool madeProgress = false;
DeletionState deletion_state(options_.max_write_buffer_number);
DeletionState deletion_state(options_.max_write_buffer_number, true);
assert(bg_flush_scheduled_);
MutexLock l(&mutex_);
@ -1715,7 +1782,7 @@ void DBImpl::TEST_PurgeObsoleteteWAL() {
void DBImpl::BackgroundCallCompaction() {
bool madeProgress = false;
DeletionState deletion_state(options_.max_write_buffer_number);
DeletionState deletion_state(options_.max_write_buffer_number, true);
MaybeDumpStats();
@ -1768,7 +1835,7 @@ void DBImpl::BackgroundCallCompaction() {
}
Status DBImpl::BackgroundCompaction(bool* madeProgress,
DeletionState& deletion_state) {
DeletionState& deletion_state) {
*madeProgress = false;
mutex_.AssertHeld();
@ -1821,6 +1888,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
f->smallest, f->largest,
f->smallest_seqno, f->largest_seqno);
status = versions_->LogAndApply(c->edit(), &mutex_);
InstallSuperVersion(deletion_state);
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
static_cast<unsigned long long>(f->number),
@ -2454,14 +2522,22 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
}
stats.files_out_levelnp1 = num_output_files;
for (int i = 0; i < compact->compaction->num_input_files(0); i++)
for (int i = 0; i < compact->compaction->num_input_files(0); i++) {
stats.bytes_readn += compact->compaction->input(0, i)->file_size;
RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
compact->compaction->input(0, i)->file_size);
}
for (int i = 0; i < compact->compaction->num_input_files(1); i++)
for (int i = 0; i < compact->compaction->num_input_files(1); i++) {
stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size;
RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
compact->compaction->input(1, i)->file_size);
}
for (int i = 0; i < num_output_files; i++) {
stats.bytes_written += compact->outputs[i].file_size;
RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES,
compact->outputs[i].file_size);
}
LogFlush(options_.info_log);
@ -2474,6 +2550,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
if (status.ok()) {
status = InstallCompactionResults(compact);
InstallSuperVersion(deletion_state);
}
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log,
@ -2581,6 +2658,44 @@ Status DBImpl::Get(const ReadOptions& options,
return GetImpl(options, key, value);
}
// DeletionState gets created and destructed outside of the lock -- we
// use this convinently to:
// * malloc one SuperVersion() outside of the lock -- new_superversion
// * delete one SuperVersion() outside of the lock -- superversion_to_free
//
// However, if InstallSuperVersion() gets called twice with the same,
// deletion_state, we can't reuse the SuperVersion() that got malloced because
// first call already used it. In that rare case, we take a hit and create a
// new SuperVersion() inside of the mutex. We do similar thing
// for superversion_to_free
void DBImpl::InstallSuperVersion(DeletionState& deletion_state) {
// if new_superversion == nullptr, it means somebody already used it
SuperVersion* new_superversion =
(deletion_state.new_superversion != nullptr) ?
deletion_state.new_superversion : new SuperVersion();
SuperVersion* old_superversion = InstallSuperVersion(new_superversion);
deletion_state.new_superversion = nullptr;
if (deletion_state.superversion_to_free != nullptr) {
// somebody already put it there
delete old_superversion;
} else {
deletion_state.superversion_to_free = old_superversion;
}
}
DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
SuperVersion* new_superversion) {
mutex_.AssertHeld();
new_superversion->Init(mem_, imm_, versions_->current());
SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion;
if (old_superversion != nullptr && old_superversion->Unref()) {
old_superversion->Cleanup();
return old_superversion; // will let caller delete outside of mutex
}
return nullptr;
}
Status DBImpl::GetImpl(const ReadOptions& options,
const Slice& key,
std::string* value,
@ -2591,28 +2706,21 @@ Status DBImpl::GetImpl(const ReadOptions& options,
StopWatchNano snapshot_timer(env_, false);
StartPerfTimer(&snapshot_timer);
SequenceNumber snapshot;
std::vector<MemTable*> to_delete;
to_delete.reserve(options_.max_write_buffer_number);
mutex_.Lock();
if (options.snapshot != nullptr) {
snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
} else {
snapshot = versions_->LastSequence();
}
MemTable* mem = mem_;
MemTableList imm = imm_;
Version* current = versions_->current();
mem->Ref();
imm.RefAll();
current->Ref();
// Unlock while reading from files and memtables
// This can be replaced by using atomics and spinlock instead of big mutex
mutex_.Lock();
SuperVersion* get_version = super_version_->Ref();
mutex_.Unlock();
bool have_stat_update = false;
Version::GetStats stats;
// Prepare to store a list of merge operations if merge occurs.
MergeContext merge_context;
@ -2621,18 +2729,18 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// merge_operands will contain the sequence of merges in the latter case.
LookupKey lkey(key, snapshot);
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
if (mem->Get(lkey, value, &s, merge_context, options_)) {
if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) {
// Done
RecordTick(options_.statistics.get(), MEMTABLE_HIT);
} else if (imm.Get(lkey, value, &s, merge_context, options_)) {
} else if (get_version->imm.Get(lkey, value, &s, merge_context, options_)) {
// Done
RecordTick(options_.statistics.get(), MEMTABLE_HIT);
} else {
StopWatchNano from_files_timer(env_, false);
StartPerfTimer(&from_files_timer);
current->Get(options, lkey, value, &s, &merge_context, &stats,
options_, value_found);
get_version->current->Get(options, lkey, value, &s, &merge_context, &stats,
options_, value_found);
have_stat_update = true;
BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
RecordTick(options_.statistics.get(), MEMTABLE_MISS);
@ -2640,22 +2748,30 @@ Status DBImpl::GetImpl(const ReadOptions& options,
StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer);
mutex_.Lock();
if (!options_.disable_seek_compaction &&
have_stat_update && current->UpdateStats(stats)) {
MaybeScheduleFlushOrCompaction();
bool delete_get_version = false;
if (!options_.disable_seek_compaction && have_stat_update) {
mutex_.Lock();
if (get_version->current->UpdateStats(stats)) {
MaybeScheduleFlushOrCompaction();
}
if (get_version->Unref()) {
get_version->Cleanup();
delete_get_version = true;
}
mutex_.Unlock();
} else {
if (get_version->Unref()) {
mutex_.Lock();
get_version->Cleanup();
mutex_.Unlock();
delete_get_version = true;
}
}
if (delete_get_version) {
delete get_version;
}
MemTable* m = mem->Unref();
imm.UnrefAll(&to_delete);
current->Unref();
mutex_.Unlock();
// free up all obsolete memtables outside the mutex
delete m;
for (MemTable* v: to_delete) delete v;
LogFlush(options_.info_log);
// Note, tickers are atomic now - no lock protection needed any more.
RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
@ -2673,7 +2789,6 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
SequenceNumber snapshot;
std::vector<MemTable*> to_delete;
to_delete.reserve(options_.max_write_buffer_number);
mutex_.Lock();
if (options.snapshot != nullptr) {
@ -2748,8 +2863,6 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
delete m;
for (MemTable* v: to_delete) delete v;
LogFlush(options_.info_log);
RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys);
RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead);
@ -2831,17 +2944,27 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
w.done = false;
StopWatch sw(env_, options_.statistics.get(), DB_WRITE);
MutexLock l(&mutex_);
mutex_.Lock();
writers_.push_back(&w);
while (!w.done && &w != writers_.front()) {
w.cv.Wait();
}
if (!options.disableWAL) {
RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1);
}
if (w.done) {
mutex_.Unlock();
RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1);
return w.status;
} else {
RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
}
// May temporarily unlock and wait.
Status status = MakeRoomForWrite(my_batch == nullptr);
SuperVersion* superversion_to_free = nullptr;
Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free);
uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w;
if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions
@ -2877,7 +3000,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (!options.disableWAL) {
StopWatchNano timer(env_);
StartPerfTimer(&timer);
status = log_->AddRecord(WriteBatchInternal::Contents(updates));
Slice log_entry = WriteBatchInternal::Contents(updates);
status = log_->AddRecord(log_entry);
RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1);
RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size());
if (status.ok() && options.sync) {
if (options_.use_fsync) {
StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
@ -2906,7 +3032,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
SEQUENCE_NUMBER, last_sequence);
}
StartPerfTimer(&pre_post_process_timer);
LogFlush(options_.info_log);
mutex_.Lock();
if (status.ok()) {
versions_->SetLastSequence(last_sequence);
@ -2933,6 +3058,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (!writers_.empty()) {
writers_.front()->cv.Signal();
}
mutex_.Unlock();
delete superversion_to_free;
BumpPerfTime(&perf_context.write_pre_and_post_process_time,
&pre_post_process_timer);
return status;
@ -3027,7 +3154,8 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) {
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
Status DBImpl::MakeRoomForWrite(bool force) {
Status DBImpl::MakeRoomForWrite(bool force,
SuperVersion** superversion_to_free) {
mutex_.AssertHeld();
assert(!writers_.empty());
bool allow_delay = !force;
@ -3036,6 +3164,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
uint64_t rate_limit_delay_millis = 0;
Status s;
double score;
*superversion_to_free = nullptr;
while (true) {
if (!bg_error_.ok()) {
@ -3162,6 +3291,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
// Do this without holding the dbmutex lock.
assert(versions_->PrevLogNumber() == 0);
uint64_t new_log_number = versions_->NewFileNumber();
SuperVersion* new_superversion = nullptr;
mutex_.Unlock();
{
EnvOptions soptions(storage_options_);
@ -3178,6 +3308,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size);
memtmp = new MemTable(
internal_comparator_, mem_rep_factory_, NumberLevels(), options_);
new_superversion = new SuperVersion(options_.max_write_buffer_number);
}
}
mutex_.Lock();
@ -3202,11 +3333,16 @@ Status DBImpl::MakeRoomForWrite(bool force) {
mem_->SetLogNumber(logfile_number_);
force = false; // Do not force another compaction if have room
MaybeScheduleFlushOrCompaction();
*superversion_to_free = InstallSuperVersion(new_superversion);
}
}
return s;
}
const std::string& DBImpl::GetName() const {
return dbname_;
}
Env* DBImpl::GetEnv() const {
return env_;
}
@ -3256,6 +3392,13 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
} else if (in == "stats") {
char buf[1000];
uint64_t wal_bytes = 0;
uint64_t wal_synced = 0;
uint64_t user_bytes_written = 0;
uint64_t write_other = 0;
uint64_t write_self = 0;
uint64_t write_with_wal = 0;
uint64_t total_bytes_written = 0;
uint64_t total_bytes_read = 0;
uint64_t micros_up = env_->NowMicros() - started_at_;
@ -3268,6 +3411,16 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
uint64_t interval_bytes_new = 0;
double interval_seconds_up = 0;
Statistics* s = options_.statistics.get();
if (s) {
wal_bytes = s->getTickerCount(WAL_FILE_BYTES);
wal_synced = s->getTickerCount(WAL_FILE_SYNCED);
user_bytes_written = s->getTickerCount(BYTES_WRITTEN);
write_other = s->getTickerCount(WRITE_DONE_BY_OTHER);
write_self = s->getTickerCount(WRITE_DONE_BY_SELF);
write_with_wal = s->getTickerCount(WRITE_WITH_WAL);
}
// Pardon the long line but I think it is easier to read this way.
snprintf(buf, sizeof(buf),
" Compactions\n"
@ -3324,19 +3477,38 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
}
}
interval_bytes_new = stats_[0].bytes_written - last_stats_.bytes_new_;
interval_bytes_read = total_bytes_read - last_stats_.bytes_read_;
interval_bytes_written = total_bytes_written - last_stats_.bytes_written_;
interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_;
interval_bytes_read = total_bytes_read - last_stats_.compaction_bytes_read_;
interval_bytes_written =
total_bytes_written - last_stats_.compaction_bytes_written_;
interval_seconds_up = seconds_up - last_stats_.seconds_up_;
snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
seconds_up, interval_seconds_up);
value->append(buf);
snprintf(buf, sizeof(buf),
"Writes cumulative: %llu total, %llu batches, "
"%.1f per batch, %.2f ingest GB\n",
(unsigned long long) (write_other + write_self),
(unsigned long long) write_self,
(write_other + write_self) / (double) (write_self + 1),
user_bytes_written / (1048576.0 * 1024));
value->append(buf);
snprintf(buf, sizeof(buf),
"WAL cumulative: %llu WAL writes, %llu WAL syncs, "
"%.2f writes per sync, %.2f GB written\n",
(unsigned long long) write_with_wal,
(unsigned long long ) wal_synced,
write_with_wal / (double) (wal_synced + 1),
wal_bytes / (1048576.0 * 1024));
value->append(buf);
snprintf(buf, sizeof(buf),
"Compaction IO cumulative (GB): "
"%.2f new, %.2f read, %.2f write, %.2f read+write\n",
stats_[0].bytes_written / (1048576.0 * 1024),
user_bytes_written / (1048576.0 * 1024),
total_bytes_read / (1048576.0 * 1024),
total_bytes_written / (1048576.0 * 1024),
(total_bytes_read + total_bytes_written) / (1048576.0 * 1024));
@ -3345,7 +3517,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
snprintf(buf, sizeof(buf),
"Compaction IO cumulative (MB/sec): "
"%.1f new, %.1f read, %.1f write, %.1f read+write\n",
stats_[0].bytes_written / 1048576.0 / seconds_up,
user_bytes_written / 1048576.0 / seconds_up,
total_bytes_read / 1048576.0 / seconds_up,
total_bytes_written / 1048576.0 / seconds_up,
(total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up);
@ -3354,9 +3526,38 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
// +1 to avoid divide by 0 and NaN
snprintf(buf, sizeof(buf),
"Amplification cumulative: %.1f write, %.1f compaction\n",
(double) total_bytes_written / (stats_[0].bytes_written+1),
(double) (total_bytes_written + total_bytes_read)
/ (stats_[0].bytes_written+1));
(double) (total_bytes_written + wal_bytes)
/ (user_bytes_written + 1),
(double) (total_bytes_written + total_bytes_read + wal_bytes)
/ (user_bytes_written + 1));
value->append(buf);
uint64_t interval_write_other = write_other - last_stats_.write_other_;
uint64_t interval_write_self = write_self - last_stats_.write_self_;
snprintf(buf, sizeof(buf),
"Writes interval: %llu total, %llu batches, "
"%.1f per batch, %.1f ingest MB\n",
(unsigned long long) (interval_write_other + interval_write_self),
(unsigned long long) interval_write_self,
(double) (interval_write_other + interval_write_self)
/ (interval_write_self + 1),
(user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0);
value->append(buf);
uint64_t interval_write_with_wal =
write_with_wal - last_stats_.write_with_wal_;
uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_;
uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_;
snprintf(buf, sizeof(buf),
"WAL interval: %llu WAL writes, %llu WAL syncs, "
"%.2f writes per sync, %.2f MB written\n",
(unsigned long long) interval_write_with_wal,
(unsigned long long ) interval_wal_synced,
interval_write_with_wal / (double) (interval_wal_synced + 1),
interval_wal_bytes / (1048576.0 * 1024));
value->append(buf);
snprintf(buf, sizeof(buf),
@ -3381,9 +3582,10 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
// +1 to avoid divide by 0 and NaN
snprintf(buf, sizeof(buf),
"Amplification interval: %.1f write, %.1f compaction\n",
(double) interval_bytes_written / (interval_bytes_new+1),
(double) (interval_bytes_written + interval_bytes_read) /
(interval_bytes_new+1));
(double) (interval_bytes_written + wal_bytes)
/ (interval_bytes_new + 1),
(double) (interval_bytes_written + interval_bytes_read + wal_bytes)
/ (interval_bytes_new + 1));
value->append(buf);
snprintf(buf, sizeof(buf),
@ -3404,10 +3606,15 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
(unsigned long) total_slowdown_count);
value->append(buf);
last_stats_.bytes_read_ = total_bytes_read;
last_stats_.bytes_written_ = total_bytes_written;
last_stats_.bytes_new_ = stats_[0].bytes_written;
last_stats_.compaction_bytes_read_ = total_bytes_read;
last_stats_.compaction_bytes_written_ = total_bytes_written;
last_stats_.ingest_bytes_ = user_bytes_written;
last_stats_.seconds_up_ = seconds_up;
last_stats_.wal_bytes_ = wal_bytes;
last_stats_.wal_synced_ = wal_synced;
last_stats_.write_with_wal_ = write_with_wal;
last_stats_.write_other_ = write_other;
last_stats_.write_self_ = write_self;
return true;
} else if (in == "sstables") {
@ -3482,7 +3689,7 @@ Status DBImpl::DeleteFile(std::string name) {
FileMetaData metadata;
int maxlevel = NumberLevels();
VersionEdit edit(maxlevel);
DeletionState deletion_state;
DeletionState deletion_state(0, true);
{
MutexLock l(&mutex_);
status = versions_->GetMetadataForFile(number, &level, &metadata);
@ -3512,14 +3719,14 @@ Status DBImpl::DeleteFile(std::string name) {
}
edit.DeleteFile(level, number);
status = versions_->LogAndApply(&edit, &mutex_);
if (status.ok()) {
InstallSuperVersion(deletion_state);
}
FindObsoleteFiles(deletion_state, false);
} // lock released here
LogFlush(options_.info_log);
if (status.ok()) {
// remove files outside the db-lock
PurgeObsoleteFiles(deletion_state);
}
// remove files outside the db-lock
PurgeObsoleteFiles(deletion_state);
return status;
}
@ -3619,6 +3826,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
}
if (s.ok()) {
delete impl->InstallSuperVersion(new DBImpl::SuperVersion());
impl->mem_->SetLogNumber(impl->logfile_number_);
impl->DeleteObsoleteFiles();
impl->MaybeScheduleFlushOrCompaction();

@ -67,6 +67,7 @@ class DBImpl : public DB {
virtual int NumberLevels();
virtual int MaxMemCompactionLevel();
virtual int Level0StopWriteTrigger();
virtual const std::string& GetName() const;
virtual Env* GetEnv() const;
virtual const Options& GetOptions() const;
virtual Status Flush(const FlushOptions& options);
@ -127,12 +128,38 @@ class DBImpl : public DB {
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
}
// needed for CleanupIteratorState
// holds references to memtable, all immutable memtables and version
struct SuperVersion {
MemTable* mem;
MemTableList imm;
Version* current;
std::atomic<uint32_t> refs;
// We need to_delete because during Cleanup(), imm.UnrefAll() returns
// all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction
std::vector<MemTable*> to_delete;
// should be called outside the mutex
explicit SuperVersion(const int num_memtables = 0);
~SuperVersion();
SuperVersion* Ref();
// Returns true if this was the last reference and caller should
// call Clenaup() and delete the object
bool Unref();
// call these two methods with db mutex held
// Cleanup unrefs mem, imm and current. Also, it stores all memtables
// that needs to be deleted in to_delete vector. Unrefing those
// objects needs to be done in the mutex
void Cleanup();
void Init(MemTable* new_mem, const MemTableList& new_imm,
Version* new_current);
};
// needed for CleanupIteratorState
struct DeletionState {
inline bool HaveSomethingToDelete() const {
return memtables_to_free.size() ||
all_files.size() ||
return all_files.size() ||
sst_delete_files.size() ||
log_delete_files.size();
}
@ -154,15 +181,35 @@ class DBImpl : public DB {
// a list of memtables to be free
std::vector<MemTable *> memtables_to_free;
SuperVersion* superversion_to_free; // if nullptr nothing to free
SuperVersion* new_superversion; // if nullptr no new superversion
// the current manifest_file_number, log_number and prev_log_number
// that corresponds to the set of files in 'live'.
uint64_t manifest_file_number, log_number, prev_log_number;
explicit DeletionState(const int num_memtables = 0) {
explicit DeletionState(const int num_memtables = 0,
bool create_superversion = false) {
manifest_file_number = 0;
log_number = 0;
prev_log_number = 0;
memtables_to_free.reserve(num_memtables);
superversion_to_free = nullptr;
new_superversion =
create_superversion ? new SuperVersion(num_memtables) : nullptr;
}
~DeletionState() {
// free pending memtables
for (auto m : memtables_to_free) {
delete m;
}
// free superversion. if nullptr, this will be noop
delete superversion_to_free;
// if new_superversion was not used, it will be non-nullptr and needs
// to be freed here
delete new_superversion;
}
};
@ -239,7 +286,11 @@ class DBImpl : public DB {
uint64_t* filenumber);
uint64_t SlowdownAmount(int n, int top, int bottom);
Status MakeRoomForWrite(bool force /* compact even if there is room? */);
// MakeRoomForWrite will return superversion_to_free through an arugment,
// which the caller needs to delete. We do it because caller can delete
// the superversion outside of mutex
Status MakeRoomForWrite(bool force /* compact even if there is room? */,
SuperVersion** superversion_to_free);
WriteBatch* BuildBatchGroup(Writer** last_writer);
// Force current memtable contents to be flushed.
@ -323,6 +374,8 @@ class DBImpl : public DB {
uint64_t logfile_number_;
unique_ptr<log::Writer> log_;
SuperVersion* super_version_;
std::string host_name_;
// Queue of writers.
@ -440,15 +493,25 @@ class DBImpl : public DB {
// Used to compute per-interval statistics
struct StatsSnapshot {
uint64_t bytes_read_;
uint64_t bytes_written_;
uint64_t bytes_new_;
uint64_t compaction_bytes_read_; // Bytes read by compaction
uint64_t compaction_bytes_written_; // Bytes written by compaction
uint64_t ingest_bytes_; // Bytes written by user
uint64_t wal_bytes_; // Bytes written to WAL
uint64_t wal_synced_; // Number of times WAL is synced
uint64_t write_with_wal_; // Number of writes that request WAL
// These count the number of writes processed by the calling thread or
// another thread.
uint64_t write_other_;
uint64_t write_self_;
double seconds_up_;
StatsSnapshot() : bytes_read_(0), bytes_written_(0),
bytes_new_(0), seconds_up_(0) {}
StatsSnapshot() : compaction_bytes_read_(0), compaction_bytes_written_(0),
ingest_bytes_(0), wal_bytes_(0), wal_synced_(0),
write_with_wal_(0), write_other_(0), write_self_(0),
seconds_up_(0) {}
};
// Counters from the previous time per-interval stats were computed
StatsSnapshot last_stats_;
static const int KEEP_LOG_FILE_NUM = 1000;
@ -480,6 +543,18 @@ class DBImpl : public DB {
std::vector<SequenceNumber>& snapshots,
SequenceNumber* prev_snapshot);
// will return a pointer to SuperVersion* if previous SuperVersion
// if its reference count is zero and needs deletion or nullptr if not
// As argument takes a pointer to allocated SuperVersion
// Foreground threads call this function directly (they don't carry
// deletion state and have to handle their own creation and deletion
// of SuperVersion)
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
// Background threads call this function, which is just a wrapper around
// the InstallSuperVersion() function above. Background threads carry
// deletion_state which can have new_superversion already allocated.
void InstallSuperVersion(DeletionState& deletion_state);
// Function that Get and KeyMayExist call with no_io true or false
// Note: 'value_found' from KeyMayExist propagates here
Status GetImpl(const ReadOptions& options,

@ -701,23 +701,25 @@ static std::string Key(int i) {
return std::string(buf);
}
TEST(DBTest, Empty) {
/*
TEST(DBTest, GetFromImmutableLayer) {
do {
ASSERT_TRUE(db_ != nullptr);
ASSERT_EQ("NOT_FOUND", Get("foo"));
} while (ChangeOptions());
}
Options options = CurrentOptions();
options.env = env_;
options.write_buffer_size = 100000; // Small write buffer
Reopen(&options);
TEST(DBTest, ReadWrite) {
do {
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls
Put("k1", std::string(100000, 'x')); // Fill memtable
Put("k2", std::string(100000, 'y')); // Trigger compaction
ASSERT_EQ("v1", Get("foo"));
env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls
} while (ChangeOptions());
}
*/
// Make sure that when options.block_cache is set, after a new table is
// created its index/filter blocks are added to block cache.
@ -731,7 +733,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
// Create a new talbe.
dbfull()->Flush(FlushOptions());
ASSERT_OK(dbfull()->Flush(FlushOptions()));
// index/filter blocks added to block cache right after table creation.
ASSERT_EQ(1,
@ -776,157 +778,6 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT));
}
TEST(DBTest, LevelLimitReopen) {
Options options = CurrentOptions();
Reopen(&options);
const std::string value(1024 * 1024, ' ');
int i = 0;
while (NumTableFilesAtLevel(2) == 0) {
ASSERT_OK(Put(Key(i++), value));
}
options.num_levels = 1;
options.max_bytes_for_level_multiplier_additional.resize(1, 1);
Status s = TryReopen(&options);
ASSERT_EQ(s.IsCorruption(), true);
ASSERT_EQ(s.ToString(),
"Corruption: VersionEdit: db already has "
"more levels than options.num_levels");
options.num_levels = 10;
options.max_bytes_for_level_multiplier_additional.resize(10, 1);
ASSERT_OK(TryReopen(&options));
}
TEST(DBTest, Preallocation) {
const std::string src = dbname_ + "/alloc_test";
unique_ptr<WritableFile> srcfile;
const EnvOptions soptions;
ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
srcfile->SetPreallocationBlockSize(1024 * 1024);
// No writes should mean no preallocation
size_t block_size, last_allocated_block;
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 0UL);
// Small write should preallocate one block
srcfile->Append("test");
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 1UL);
// Write an entire preallocation block, make sure we increased by two.
std::string buf(block_size, ' ');
srcfile->Append(buf);
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 2UL);
// Write five more blocks at once, ensure we're where we need to be.
buf = std::string(block_size * 5, ' ');
srcfile->Append(buf);
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 7UL);
}
TEST(DBTest, PutDeleteGet) {
do {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_EQ("v2", Get("foo"));
ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
ASSERT_EQ("NOT_FOUND", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetFromImmutableLayer) {
do {
Options options = CurrentOptions();
options.env = env_;
options.write_buffer_size = 100000; // Small write buffer
Reopen(&options);
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls
Put("k1", std::string(100000, 'x')); // Fill memtable
Put("k2", std::string(100000, 'y')); // Trigger compaction
ASSERT_EQ("v1", Get("foo"));
env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls
} while (ChangeOptions());
}
TEST(DBTest, GetFromVersions) {
do {
ASSERT_OK(Put("foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetSnapshot) {
do {
// Try with both a short key and a long key
for (int i = 0; i < 2; i++) {
std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
ASSERT_OK(Put(key, "v1"));
const Snapshot* s1 = db_->GetSnapshot();
ASSERT_OK(Put(key, "v2"));
ASSERT_EQ("v2", Get(key));
ASSERT_EQ("v1", Get(key, s1));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get(key));
ASSERT_EQ("v1", Get(key, s1));
db_->ReleaseSnapshot(s1);
}
} while (ChangeOptions());
}
TEST(DBTest, GetLevel0Ordering) {
do {
// Check that we process level-0 files in correct order. The code
// below generates two level-0 files where the earlier one comes
// before the later one in the level-0 file list since the earlier
// one has a smaller "smallest" key.
ASSERT_OK(Put("bar", "b"));
ASSERT_OK(Put("foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetOrderedByLevels) {
do {
ASSERT_OK(Put("foo", "v1"));
Compact("a", "z");
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(Put("foo", "v2"));
ASSERT_EQ("v2", Get("foo"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetPicksCorrectFile) {
do {
// Arrange to have multiple files in a non-level-0 level.
ASSERT_OK(Put("a", "va"));
Compact("a", "b");
ASSERT_OK(Put("x", "vx"));
Compact("x", "y");
ASSERT_OK(Put("f", "vf"));
Compact("f", "g");
ASSERT_EQ("va", Get("a"));
ASSERT_EQ("vf", Get("f"));
ASSERT_EQ("vx", Get("x"));
} while (ChangeOptions());
}
TEST(DBTest, GetEncountersEmptyLevel) {
do {
// Arrange for the following to happen:
@ -4510,6 +4361,10 @@ class ModelDB: public DB {
return -1;
}
virtual const std::string& GetName() const {
return name_;
}
virtual Env* GetEnv() const {
return nullptr;
}
@ -4587,6 +4442,7 @@ class ModelDB: public DB {
};
const Options options_;
KVMap map_;
std::string name_ = "";
};
static std::string RandomKey(Random* rnd, int minimum = 0) {

@ -279,7 +279,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
*s = Status::Corruption("Error: Could not perform merge.");
}
} else {
*s = Status::NotFound(Slice());
*s = Status::NotFound();
}
found_final_value = true;
break;

@ -50,9 +50,8 @@ Status TableCache::FindTable(const EnvOptions& toptions,
Cache::Handle** handle, bool* table_io,
const bool no_io) {
Status s;
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf));
Slice key(reinterpret_cast<const char*>(&file_number), sizeof(file_number));
*handle = cache_->Lookup(key);
if (*handle == nullptr) {
if (no_io) { // Dont do IO and return a not-found status
@ -165,9 +164,8 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options,
}
void TableCache::Evict(uint64_t file_number) {
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
cache_->Erase(Slice(buf, sizeof(buf)));
Slice key(reinterpret_cast<const char*>(&file_number), sizeof(file_number));
cache_->Erase(key);
}
} // namespace rocksdb

@ -545,7 +545,7 @@ void Version::Get(const ReadOptions& options,
case kFound:
return;
case kDeleted:
*status = Status::NotFound(Slice()); // Use empty error message for speed
*status = Status::NotFound(); // Use empty error message for speed
return;
case kCorrupt:
*status = Status::Corruption("corrupted key for ", user_key);
@ -570,7 +570,7 @@ void Version::Get(const ReadOptions& options,
user_key);
}
} else {
*status = Status::NotFound(Slice()); // Use an empty error message for speed
*status = Status::NotFound(); // Use an empty error message for speed
}
}
@ -1112,12 +1112,6 @@ class VersionSet::Builder {
MaybeAddFile(v, level, *base_iter);
}
}
// Pre-sort level0 for Get()
if (vset_->options_->compaction_style == kCompactionStyleUniversal) {
std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo);
} else {
std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst);
}
CheckConsistency(v);
}
@ -1683,6 +1677,12 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
void VersionSet::Finalize(Version* v,
std::vector<uint64_t>& size_being_compacted) {
// Pre-sort level0 for Get()
if (options_->compaction_style == kCompactionStyleUniversal) {
std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo);
} else {
std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst);
}
double max_score = 0;
int max_score_level = 0;

@ -274,12 +274,14 @@ class VersionSet {
int64_t NumLevelBytes(int level) const;
// Return the last sequence number.
uint64_t LastSequence() const { return last_sequence_; }
uint64_t LastSequence() const {
return last_sequence_.load(std::memory_order_acquire);
}
// Set the last sequence number to s.
void SetLastSequence(uint64_t s) {
assert(s >= last_sequence_);
last_sequence_ = s;
last_sequence_.store(s, std::memory_order_release);
}
// Mark the specified file number as used.
@ -478,7 +480,7 @@ class VersionSet {
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t last_sequence_;
std::atomic<uint64_t> last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted

@ -54,171 +54,204 @@ extern "C" {
/* Exported types */
typedef struct leveldb_t leveldb_t;
typedef struct leveldb_cache_t leveldb_cache_t;
typedef struct leveldb_comparator_t leveldb_comparator_t;
typedef struct leveldb_env_t leveldb_env_t;
typedef struct leveldb_filelock_t leveldb_filelock_t;
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
typedef struct leveldb_iterator_t leveldb_iterator_t;
typedef struct leveldb_logger_t leveldb_logger_t;
typedef struct leveldb_options_t leveldb_options_t;
typedef struct leveldb_randomfile_t leveldb_randomfile_t;
typedef struct leveldb_readoptions_t leveldb_readoptions_t;
typedef struct leveldb_seqfile_t leveldb_seqfile_t;
typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
typedef struct rocksdb_t rocksdb_t;
typedef struct rocksdb_cache_t rocksdb_cache_t;
typedef struct rocksdb_comparator_t rocksdb_comparator_t;
typedef struct rocksdb_env_t rocksdb_env_t;
typedef struct rocksdb_filelock_t rocksdb_filelock_t;
typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
typedef struct rocksdb_iterator_t rocksdb_iterator_t;
typedef struct rocksdb_logger_t rocksdb_logger_t;
typedef struct rocksdb_options_t rocksdb_options_t;
typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
/* DB operations */
extern leveldb_t* leveldb_open(
const leveldb_options_t* options,
extern rocksdb_t* rocksdb_open(
const rocksdb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_close(leveldb_t* db);
extern void rocksdb_close(rocksdb_t* db);
extern void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
extern void rocksdb_put(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
extern void rocksdb_delete(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr);
extern void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
extern void rocksdb_write(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_writebatch_t* batch,
char** errptr);
/* Returns NULL if not found. A malloc()ed array otherwise.
Stores the length of the array in *vallen. */
extern char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
extern char* rocksdb_get(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr);
extern leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options);
extern rocksdb_iterator_t* rocksdb_create_iterator(
rocksdb_t* db,
const rocksdb_readoptions_t* options);
extern const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db);
extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
rocksdb_t* db);
extern void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot);
extern void rocksdb_release_snapshot(
rocksdb_t* db,
const rocksdb_snapshot_t* snapshot);
/* Returns NULL if property name is unknown.
Else returns a pointer to a malloc()-ed null-terminated value. */
extern char* leveldb_property_value(
leveldb_t* db,
extern char* rocksdb_property_value(
rocksdb_t* db,
const char* propname);
extern void leveldb_approximate_sizes(
leveldb_t* db,
extern void rocksdb_approximate_sizes(
rocksdb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes);
extern void leveldb_compact_range(
leveldb_t* db,
extern void rocksdb_compact_range(
rocksdb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
/* Management operations */
extern void leveldb_destroy_db(
const leveldb_options_t* options,
extern void rocksdb_destroy_db(
const rocksdb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_repair_db(
const leveldb_options_t* options,
extern void rocksdb_repair_db(
const rocksdb_options_t* options,
const char* name,
char** errptr);
/* Iterator */
extern void leveldb_iter_destroy(leveldb_iterator_t*);
extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
extern void rocksdb_iter_next(rocksdb_iterator_t*);
extern void rocksdb_iter_prev(rocksdb_iterator_t*);
extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
/* Write batch */
extern leveldb_writebatch_t* leveldb_writebatch_create();
extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
extern void leveldb_writebatch_put(
leveldb_writebatch_t*,
extern rocksdb_writebatch_t* rocksdb_writebatch_create();
extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
extern void rocksdb_writebatch_put(
rocksdb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void leveldb_writebatch_delete(
leveldb_writebatch_t*,
extern void rocksdb_writebatch_delete(
rocksdb_writebatch_t*,
const char* key, size_t klen);
extern void leveldb_writebatch_iterate(
leveldb_writebatch_t*,
extern void rocksdb_writebatch_iterate(
rocksdb_writebatch_t*,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen));
/* Options */
extern leveldb_options_t* leveldb_options_create();
extern void leveldb_options_destroy(leveldb_options_t*);
extern void leveldb_options_set_comparator(
leveldb_options_t*,
leveldb_comparator_t*);
extern void leveldb_options_set_compression_per_level(
leveldb_options_t* opt,
extern rocksdb_options_t* rocksdb_options_create();
extern void rocksdb_options_destroy(rocksdb_options_t*);
extern void rocksdb_options_set_comparator(
rocksdb_options_t*,
rocksdb_comparator_t*);
extern void rocksdb_options_set_compression_per_level(
rocksdb_options_t* opt,
int* level_values,
size_t num_levels);
extern void leveldb_options_set_filter_policy(
leveldb_options_t*,
leveldb_filterpolicy_t*);
extern void leveldb_options_set_create_if_missing(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_error_if_exists(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
extern void leveldb_options_set_compression_options(
leveldb_options_t* opt, int w_bits, int level, int strategy);
extern void rocksdb_options_set_filter_policy(
rocksdb_options_t*,
rocksdb_filterpolicy_t*);
extern void rocksdb_options_set_create_if_missing(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_error_if_exists(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_paranoid_checks(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
extern void rocksdb_options_set_compression_options(
rocksdb_options_t*, int, int, int);
extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
extern void rocksdb_options_set_level0_file_num_compaction_trigger(
rocksdb_options_t*, int);
extern void rocksdb_options_set_level0_slowdown_writes_trigger(
rocksdb_options_t*, int);
extern void rocksdb_options_set_level0_stop_writes_trigger(
rocksdb_options_t*, int);
extern void rocksdb_options_set_target_file_size_base(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_target_file_size_multiplier(
rocksdb_options_t*, int);
extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
extern void rocksdb_options_set_use_fsync(
rocksdb_options_t*, int);
extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
enum {
leveldb_no_compression = 0,
leveldb_snappy_compression = 1
rocksdb_no_compression = 0,
rocksdb_snappy_compression = 1,
rocksdb_zlib_compression = 1,
rocksdb_bz2_compression = 1
};
extern void leveldb_options_set_compression(leveldb_options_t*, int);
extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
enum {
rocksdb_level_compaction = 0,
rocksdb_universal_compaction = 1
};
extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
/* Comparator */
extern leveldb_comparator_t* leveldb_comparator_create(
extern rocksdb_comparator_t* rocksdb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
@ -226,11 +259,11 @@ extern leveldb_comparator_t* leveldb_comparator_create(
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*));
extern void leveldb_comparator_destroy(leveldb_comparator_t*);
extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
/* Filter policy */
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
@ -243,40 +276,65 @@ extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*));
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
int bits_per_key);
/* Read options */
extern leveldb_readoptions_t* leveldb_readoptions_create();
extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
extern void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t*,
extern rocksdb_readoptions_t* rocksdb_readoptions_create();
extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
extern void rocksdb_readoptions_set_verify_checksums(
rocksdb_readoptions_t*,
unsigned char);
extern void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t*, unsigned char);
extern void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t*,
const leveldb_snapshot_t*);
extern void rocksdb_readoptions_set_fill_cache(
rocksdb_readoptions_t*, unsigned char);
extern void rocksdb_readoptions_set_snapshot(
rocksdb_readoptions_t*,
const rocksdb_snapshot_t*);
/* Write options */
extern leveldb_writeoptions_t* leveldb_writeoptions_create();
extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
extern void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t*, unsigned char);
extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
extern void rocksdb_writeoptions_set_sync(
rocksdb_writeoptions_t*, unsigned char);
extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
/* Cache */
extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
extern void leveldb_cache_destroy(leveldb_cache_t* cache);
extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
/* Env */
extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*);
extern rocksdb_env_t* rocksdb_create_default_env();
extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
extern void rocksdb_env_destroy(rocksdb_env_t*);
/* Universal Compaction options */
enum {
rocksdb_similar_size_compaction_stop_style = 0,
rocksdb_total_size_compaction_stop_style = 1
};
extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ;
extern void rocksdb_universal_compaction_options_set_size_ratio(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_min_merge_width(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_max_merge_width(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_compression_size_percent(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_stop_style(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_destroy(
rocksdb_universal_compaction_options_t*);
#ifdef __cplusplus
} /* end extern "C" */

@ -228,6 +228,10 @@ class DB {
// Number of files in level-0 that would stop writes.
virtual int Level0StopWriteTrigger() = 0;
// Get DB name -- the exact same name that was provided as an argument to
// DB::Open()
virtual const std::string& GetName() const = 0;
// Get Env object from the DB
virtual Env* GetEnv() const = 0;

@ -111,52 +111,72 @@ enum Tickers {
BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
WAL_FILE_SYNCED, // Number of times WAL sync is done
WAL_FILE_BYTES, // Number of bytes written to WAL
// Writes can be processed by requesting thread or by the thread at the
// head of the writers queue.
WRITE_DONE_BY_SELF,
WRITE_DONE_BY_OTHER,
WRITE_WITH_WAL, // Number of Write calls that request WAL
COMPACT_READ_BYTES, // Bytes read during compaction
COMPACT_WRITE_BYTES, // Bytes written during compaction
TICKER_ENUM_MAX
};
// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
{ BLOCK_CACHE_ADD, "rocksdb.block.cache.add" },
{ BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" },
{ BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" },
{ BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" },
{ BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" },
{ BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" },
{ BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" },
{ BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
{ MEMTABLE_HIT, "rocksdb.memtable.hit" },
{ MEMTABLE_MISS, "rocksdb.memtable.miss" },
{ COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
{ COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
{ COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
{ NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
{ NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
{ NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
{ BYTES_WRITTEN, "rocksdb.bytes.written" },
{ BYTES_READ, "rocksdb.bytes.read" },
{ NO_FILE_CLOSES, "rocksdb.no.file.closes" },
{ NO_FILE_OPENS, "rocksdb.no.file.opens" },
{ NO_FILE_ERRORS, "rocksdb.no.file.errors" },
{ STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" },
{ STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" },
{ STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" },
{ RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" },
{ NO_ITERATORS, "rocksdb.num.iterators" },
{ NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" },
{ NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" },
{ NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" },
{ NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" },
{ NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
{ SEQUENCE_NUMBER, "rocksdb.sequence.number" },
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
{ NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" },
{ GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" },
{ BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" },
{ BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" },
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
{ BLOCK_CACHE_ADD, "rocksdb.block.cache.add" },
{ BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" },
{ BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" },
{ BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" },
{ BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" },
{ BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" },
{ BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" },
{ BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
{ MEMTABLE_HIT, "rocksdb.memtable.hit" },
{ MEMTABLE_MISS, "rocksdb.memtable.miss" },
{ COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
{ COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
{ COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
{ NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
{ NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
{ NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
{ BYTES_WRITTEN, "rocksdb.bytes.written" },
{ BYTES_READ, "rocksdb.bytes.read" },
{ NO_FILE_CLOSES, "rocksdb.no.file.closes" },
{ NO_FILE_OPENS, "rocksdb.no.file.opens" },
{ NO_FILE_ERRORS, "rocksdb.no.file.errors" },
{ STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" },
{ STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" },
{ STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" },
{ RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" },
{ NO_ITERATORS, "rocksdb.num.iterators" },
{ NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" },
{ NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" },
{ NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" },
{ NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" },
{ NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
{ SEQUENCE_NUMBER, "rocksdb.sequence.number" },
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
{ NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" },
{ GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" },
{ BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" },
{ BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" },
{ WAL_FILE_SYNCED, "rocksdb.wal.synced" },
{ WAL_FILE_BYTES, "rocksdb.wal.bytes" },
{ WRITE_DONE_BY_SELF, "rocksdb.write.self" },
{ WRITE_DONE_BY_OTHER, "rocksdb.write.other" },
{ WRITE_WITH_WAL, "rocksdb.write.wal" },
{ COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" },
{ COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" },
};
/**

@ -25,7 +25,7 @@ namespace rocksdb {
class Status {
public:
// Create a success status.
Status() : state_(nullptr) { }
Status() : code_(kOk), state_(nullptr) { }
~Status() { delete[] state_; }
// Copy the specified status.
@ -39,6 +39,10 @@ class Status {
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotFound, msg, msg2);
}
// Fast path for not found without malloc;
static Status NotFound() {
return Status(kNotFound);
}
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kCorruption, msg, msg2);
}
@ -59,7 +63,7 @@ class Status {
}
// Returns true iff the status indicates success.
bool ok() const { return (state_ == nullptr); }
bool ok() const { return code() == kOk; }
// Returns true iff the status indicates a NotFound error.
bool IsNotFound() const { return code() == kNotFound; }
@ -87,13 +91,6 @@ class Status {
std::string ToString() const;
private:
// OK status has a nullptr state_. Otherwise, state_ is a new[] array
// of the following form:
// state_[0..3] == length of message
// state_[4] == code
// state_[5..] == message
const char* state_;
enum Code {
kOk = 0,
kNotFound = 1,
@ -105,20 +102,30 @@ class Status {
kIncomplete = 7
};
// A nullptr state_ (which is always the case for OK) means the message
// is empty.
// of the following form:
// state_[0..3] == length of message
// state_[4..] == message
Code code_;
const char* state_;
Code code() const {
return (state_ == nullptr) ? kOk : static_cast<Code>(state_[4]);
return code_;
}
explicit Status(Code code) : code_(code), state_(nullptr) { }
Status(Code code, const Slice& msg, const Slice& msg2);
static const char* CopyState(const char* s);
};
inline Status::Status(const Status& s) {
code_ = s.code_;
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
}
inline void Status::operator=(const Status& s) {
// The following condition catches both aliasing (when this == &s),
// and the common case where both s and *this are ok.
code_ = s.code_;
if (state_ != s.state_) {
delete[] state_;
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);

@ -56,7 +56,7 @@ class LogFile {
};
struct BatchResult {
SequenceNumber sequence = SequenceNumber();
SequenceNumber sequence = 0;
std::unique_ptr<WriteBatch> writeBatchPtr;
};

@ -0,0 +1,133 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "utilities/stackable_db.h"
#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include <string>
#include <map>
#include <vector>
namespace rocksdb {
struct BackupableDBOptions {
// Where to keep the backup files. Has to be different than dbname_
// Best to set this to dbname_ + "/backups"
// Required
std::string backup_dir;
// Backup Env object. It will be used for backup file I/O. If it's
// nullptr, backups will be written out using DBs Env. If it's
// non-nullptr, backup's I/O will be performed using this object.
// If you want to have backups on HDFS, use HDFS Env here!
// Default: nullptr
Env* backup_env;
// Backup info and error messages will be written to info_log
// if non-nullptr.
// Default: nullptr
Logger* info_log;
// If sync == true, we can guarantee you'll get consistent backup even
// on a machine crash/reboot. Backup process is slower with sync enabled.
// If sync == false, we don't guarantee anything on machine reboot. However,
// chances are some of the backups are consistent.
// Default: true
bool sync;
// If true, it will delete whatever backups there are already
// Default: false
bool destroy_old_data;
explicit BackupableDBOptions(const std::string& _backup_dir,
Env* _backup_env = nullptr,
Logger* _info_log = nullptr,
bool _sync = true,
bool _destroy_old_data = false) :
backup_dir(_backup_dir),
backup_env(_backup_env),
info_log(_info_log),
sync(_sync),
destroy_old_data(_destroy_old_data) { }
};
class BackupEngine;
typedef uint32_t BackupID;
struct BackupInfo {
BackupID backup_id;
int64_t timestamp;
uint64_t size;
BackupInfo() {}
BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
: backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
};
// Stack your DB with BackupableDB to be able to backup the DB
class BackupableDB : public StackableDB {
public:
// BackupableDBOptions have to be the same as the ones used in a previous
// incarnation of the DB
//
// BackupableDB ownes the pointer `DB* db` now. You should not delete it or
// use it after the invocation of BackupableDB
BackupableDB(DB* db, const BackupableDBOptions& options);
virtual ~BackupableDB();
// Captures the state of the database in the latest backup
// NOT a thread safe call
Status CreateNewBackup(bool flush_before_backup = false);
// Returns info about backups in backup_info
void GetBackupInfo(std::vector<BackupInfo>* backup_info);
// deletes old backups, keeping latest num_backups_to_keep alive
Status PurgeOldBackups(uint32_t num_backups_to_keep);
// deletes a specific backup
Status DeleteBackup(BackupID backup_id);
private:
BackupEngine* backup_engine_;
};
// Use this class to access information about backups and restore from them
class RestoreBackupableDB {
public:
RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
~RestoreBackupableDB();
// Returns info about backups in backup_info
void GetBackupInfo(std::vector<BackupInfo>* backup_info);
// restore from backup with backup_id
// IMPORTANT -- if you restore from some backup that is not the latest,
// and you start creating new backups from the new DB, all the backups
// that were newer than the backup you restored from will be deleted
//
// Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
// If you try creating a new backup now, old backups 4 and 5 will be deleted
// and new backup with ID 4 will be created.
Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
const std::string& wal_dir);
// restore from the latest backup
Status RestoreDBFromLatestBackup(const std::string& db_dir,
const std::string& wal_dir);
// deletes old backups, keeping latest num_backups_to_keep alive
Status PurgeOldBackups(uint32_t num_backups_to_keep);
// deletes a specific backup
Status DeleteBackup(BackupID backup_id);
private:
BackupEngine* backup_engine_;
};
} // rocksdb namespace

@ -103,6 +103,10 @@ class StackableDB : public DB {
return db_->Level0StopWriteTrigger();
}
virtual const std::string& GetName() const override {
return db_->GetName();
}
virtual Env* GetEnv() const override {
return db_->GetEnv();
}

@ -31,12 +31,7 @@ static const char* GetExecutableName()
}
}
static void StackTraceHandler(int sig) {
// reset to default handler
signal(sig, SIG_DFL);
fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
void PrintStack(int first_frames_to_skip) {
const int kMaxFrames = 100;
void *frames[kMaxFrames];
@ -45,11 +40,8 @@ static void StackTraceHandler(int sig) {
auto executable = GetExecutableName();
const int kSkip = 2; // skip the top two signal handler related frames
for (int i = kSkip; i < num_frames; ++i)
{
fprintf(stderr, "#%-2d %p ", i - kSkip, frames[i]);
for (int i = first_frames_to_skip; i < num_frames; ++i) {
fprintf(stderr, "#%-2d ", i - first_frames_to_skip);
if (symbols) {
fprintf(stderr, "%s ", symbols[i]);
}
@ -57,22 +49,29 @@ static void StackTraceHandler(int sig) {
// out source to addr2line, for the address translation
const int kLineMax = 256;
char cmd[kLineMax];
sprintf(cmd,"addr2line %p -e %s 2>&1", frames[i] , executable);
sprintf(cmd, "addr2line %p -e %s -f -C 2>&1", frames[i], executable);
auto f = popen(cmd, "r");
if (f) {
char line[kLineMax];
while (fgets(line, sizeof(line), f)) {
fprintf(stderr, "%s", line);
line[strlen(line) - 1] = 0; // remove newline
fprintf(stderr, "%s\t", line);
}
pclose(f);
} else {
fprintf(stderr, "\n");
}
} else {
fprintf(stderr, "\n");
fprintf(stderr, " %p", frames[i]);
}
fprintf(stderr, "\n");
}
}
static void StackTraceHandler(int sig) {
// reset to default handler
signal(sig, SIG_DFL);
fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
// skip the top three signal handler related frames
PrintStack(3);
// re-signal to default handler (so we still get core dump if needed...)
raise(sig);
}
@ -96,6 +95,7 @@ void InstallStackTraceHandler() {
namespace rocksdb {
void InstallStackTraceHandler() {}
void PrintStack(int first_frames_to_skip) {}
}

@ -0,0 +1,329 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <iterator>
#include <vector>
namespace rocksdb {
// A vector that leverages pre-allocated stack-based array to achieve better
// performance for array with small amount of items.
//
// The interface resembles that of vector, but with less features since we aim
// to solve the problem that we have in hand, rather than implementing a
// full-fledged generic container.
//
// Currently we don't support:
// * reserve()/shrink_to_fit()/resize()
// If used correctly, in most cases, people should not touch the
// underlying vector at all.
// * random insert()/erase(), please only use push_back()/pop_back().
// * No move/swap operations. Each autovector instance has a
// stack-allocated array and if we want support move/swap operations, we
// need to copy the arrays other than just swapping the pointers. In this
// case we'll just explicitly forbid these operations since they may
// lead users to make false assumption by thinking they are inexpensive
// operations.
//
// Naming style of public methods almost follows that of the STL's.
template <class T, size_t kSize = 8>
class autovector {
public:
// General STL-style container member types.
typedef T value_type;
typedef typename std::vector<T>::difference_type difference_type;
typedef typename std::vector<T>::size_type size_type;
typedef value_type& reference;
typedef const value_type& const_reference;
typedef value_type* pointer;
typedef const value_type* const_pointer;
// This class is the base for regular/const iterator
template <class TAutoVector, class TValueType>
class iterator_impl {
public:
// -- iterator traits
typedef iterator_impl<TAutoVector, TValueType> self_type;
typedef TValueType value_type;
typedef TValueType& reference;
typedef TValueType* pointer;
typedef typename TAutoVector::difference_type difference_type;
typedef std::random_access_iterator_tag iterator_category;
iterator_impl(TAutoVector* vect, size_t index)
: vect_(vect)
, index_(index) {
};
iterator_impl(const iterator_impl&) = default;
~iterator_impl() { }
iterator_impl& operator=(const iterator_impl&) = default;
// -- Advancement
// iterator++
self_type& operator++() {
++index_;
return *this;
}
// ++iterator
self_type operator++(int) {
auto old = *this;
++index_;
return old;
}
// iterator--
self_type& operator--() {
--index_;
return *this;
}
// --iterator
self_type operator--(int) {
auto old = *this;
--index_;
return old;
}
self_type operator-(difference_type len) {
return self_type(vect_, index_ - len);
}
difference_type operator-(const self_type& other) {
assert(vect_ == other.vect_);
return index_ - other.index_;
}
self_type operator+(difference_type len) {
return self_type(vect_, index_ + len);
}
self_type& operator+=(difference_type len) {
index_ += len;
return *this;
}
self_type& operator-=(difference_type len) {
index_ -= len;
return *this;
}
// -- Reference
reference operator*() {
assert(vect_->size() >= index_);
return (*vect_)[index_];
}
pointer operator->() {
assert(vect_->size() >= index_);
return &(*vect_)[index_];
}
// -- Logical Operators
bool operator==(const self_type& other) const {
assert(vect_ == other.vect_);
return index_ == other.index_;
}
bool operator!=(const self_type& other) const {
return !(*this == other);
}
bool operator>(const self_type& other) const {
assert(vect_ == other.vect_);
return index_ > other.index_;
}
bool operator<(const self_type& other) const {
assert(vect_ == other.vect_);
return index_ < other.index_;
}
bool operator>=(const self_type& other) const {
assert(vect_ == other.vect_);
return index_ >= other.index_;
}
bool operator<=(const self_type& other) const {
assert(vect_ == other.vect_);
return index_ <= other.index_;
}
private:
TAutoVector* vect_ = nullptr;
size_t index_ = 0;
};
typedef iterator_impl<autovector, value_type> iterator;
typedef iterator_impl<const autovector, const value_type> const_iterator;
typedef std::reverse_iterator<iterator> reverse_iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
autovector() = default;
~autovector() = default;
// -- Immutable operations
// Indicate if all data resides in in-stack data structure.
bool only_in_stack() const {
// If no element was inserted at all, the vector's capacity will be `0`.
return vect_.capacity() == 0;
}
size_type size() const {
return num_stack_items_ + vect_.size();
}
bool empty() const {
return size() == 0;
}
// will not check boundry
const_reference operator[](size_type n) const {
return n < kSize ? values_[n] : vect_[n - kSize];
}
reference operator[](size_type n) {
return n < kSize ? values_[n] : vect_[n - kSize];
}
// will check boundry
const_reference at(size_type n) const {
if (n >= size()) {
throw std::out_of_range("autovector: index out of range");
}
return (*this)[n];
}
reference at(size_type n) {
if (n >= size()) {
throw std::out_of_range("autovector: index out of range");
}
return (*this)[n];
}
reference front() {
assert(!empty());
return *begin();
}
const_reference front() const {
assert(!empty());
return *begin();
}
reference back() {
assert(!empty());
return *(end() - 1);
}
const_reference back() const {
assert(!empty());
return *(end() - 1);
}
// -- Mutable Operations
void push_back(T&& item) {
if (num_stack_items_ < kSize) {
values_[num_stack_items_++] = std::move(item);
} else {
vect_.push_back(item);
}
}
void push_back(const T& item) {
push_back(value_type(item));
}
template<class... Args>
void emplace_back(Args&&... args) {
push_back(value_type(args...));
}
void pop_back() {
assert(!empty());
if (!vect_.empty()) {
vect_.pop_back();
} else {
--num_stack_items_;
}
}
void clear() {
num_stack_items_ = 0;
vect_.clear();
}
// -- Copy and Assignment
autovector& assign(const autovector& other);
autovector(const autovector& other) {
assign(other);
}
autovector& operator=(const autovector& other) {
return assign(other);
}
// move operation are disallowed since it is very hard to make sure both
// autovectors are allocated from the same function stack.
autovector& operator=(autovector&& other) = delete;
autovector(autovector&& other) = delete;
// -- Iterator Operations
iterator begin() {
return iterator(this, 0);
}
const_iterator begin() const {
return const_iterator(this, 0);
}
iterator end() {
return iterator(this, this->size());
}
const_iterator end() const {
return const_iterator(this, this->size());
}
reverse_iterator rbegin() {
return reverse_iterator(end());
}
const_reverse_iterator rbegin() const {
return const_reverse_iterator(end());
}
reverse_iterator rend() {
return reverse_iterator(begin());
}
const_reverse_iterator rend() const {
return const_reverse_iterator(begin());
}
private:
size_type num_stack_items_ = 0; // current number of items
value_type values_[kSize]; // the first `kSize` items
// used only if there are more than `kSize` items.
std::vector<T> vect_;
};
template <class T, size_t kSize>
autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) {
// copy the internal vector
vect_.assign(other.vect_.begin(), other.vect_.end());
// copy array
num_stack_items_ = other.num_stack_items_;
std::copy(other.values_, other.values_ + num_stack_items_, values_);
return *this;
}
} // rocksdb

@ -0,0 +1,290 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <atomic>
#include <iostream>
#include "rocksdb/env.h"
#include "util/autovector.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
using namespace std;
class AutoVectorTest { };
const size_t kSize = 8;
TEST(AutoVectorTest, PushBackAndPopBack) {
autovector<size_t, kSize> vec;
ASSERT_TRUE(vec.empty());
ASSERT_EQ(0ul, vec.size());
for (size_t i = 0; i < 1000 * kSize; ++i) {
vec.push_back(i);
ASSERT_TRUE(!vec.empty());
if (i < kSize) {
ASSERT_TRUE(vec.only_in_stack());
} else {
ASSERT_TRUE(!vec.only_in_stack());
}
ASSERT_EQ(i + 1, vec.size());
ASSERT_EQ(i, vec[i]);
ASSERT_EQ(i, vec.at(i));
}
size_t size = vec.size();
while (size != 0) {
vec.pop_back();
// will always be in heap
ASSERT_TRUE(!vec.only_in_stack());
ASSERT_EQ(--size, vec.size());
}
ASSERT_TRUE(vec.empty());
}
TEST(AutoVectorTest, EmplaceBack) {
typedef std::pair<size_t, std::string> ValueType;
autovector<ValueType, kSize> vec;
for (size_t i = 0; i < 1000 * kSize; ++i) {
vec.emplace_back(i, std::to_string(i + 123));
ASSERT_TRUE(!vec.empty());
if (i < kSize) {
ASSERT_TRUE(vec.only_in_stack());
} else {
ASSERT_TRUE(!vec.only_in_stack());
}
ASSERT_EQ(i + 1, vec.size());
ASSERT_EQ(i, vec[i].first);
ASSERT_EQ(std::to_string(i + 123), vec[i].second);
}
vec.clear();
ASSERT_TRUE(vec.empty());
ASSERT_TRUE(!vec.only_in_stack());
}
void AssertEqual(
const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) {
ASSERT_EQ(a.size(), b.size());
ASSERT_EQ(a.empty(), b.empty());
ASSERT_EQ(a.only_in_stack(), b.only_in_stack());
for (size_t i = 0; i < a.size(); ++i) {
ASSERT_EQ(a[i], b[i]);
}
}
TEST(AutoVectorTest, CopyAndAssignment) {
// Test both heap-allocated and stack-allocated cases.
for (auto size : { kSize / 2, kSize * 1000 }) {
autovector<size_t, kSize> vec;
for (size_t i = 0; i < size; ++i) {
vec.push_back(i);
}
{
autovector<size_t, kSize> other;
other = vec;
AssertEqual(other, vec);
}
{
autovector<size_t, kSize> other(vec);
AssertEqual(other, vec);
}
}
}
TEST(AutoVectorTest, Iterators) {
autovector<std::string, kSize> vec;
for (size_t i = 0; i < kSize * 1000; ++i) {
vec.push_back(std::to_string(i));
}
// basic operator test
ASSERT_EQ(vec.front(), *vec.begin());
ASSERT_EQ(vec.back(), *(vec.end() - 1));
ASSERT_TRUE(vec.begin() < vec.end());
// non-const iterator
size_t index = 0;
for (const auto& item : vec) {
ASSERT_EQ(vec[index++], item);
}
index = vec.size() - 1;
for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) {
ASSERT_EQ(vec[index--], *pos);
}
// const iterator
const auto& cvec = vec;
index = 0;
for (const auto& item : cvec) {
ASSERT_EQ(cvec[index++], item);
}
index = vec.size() - 1;
for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) {
ASSERT_EQ(cvec[index--], *pos);
}
// forward and backward
auto pos = vec.begin();
while (pos != vec.end()) {
auto old_val = *pos;
auto old = pos++;
// HACK: make sure -> works
ASSERT_TRUE(!old->empty());
ASSERT_EQ(old_val, *old);
ASSERT_TRUE(pos == vec.end() || old_val != *pos);
}
pos = vec.begin();
for (size_t i = 0; i < vec.size(); i += 2) {
// Cannot use ASSERT_EQ since that macro depends on iostream serialization
ASSERT_TRUE(pos + 2 - 2 == pos);
pos += 2;
ASSERT_TRUE(pos >= vec.begin());
ASSERT_TRUE(pos <= vec.end());
size_t diff = static_cast<size_t>(pos - vec.begin());
ASSERT_EQ(i + 2, diff);
}
}
vector<string> GetTestKeys(size_t size) {
vector<string> keys;
keys.resize(size);
int index = 0;
for (auto& key : keys) {
key = "item-" + to_string(index++);
}
return keys;
}
template<class TVector>
void BenchmarkVectorCreationAndInsertion(
string name, size_t ops, size_t item_size,
const std::vector<typename TVector::value_type>& items) {
auto env = Env::Default();
int index = 0;
auto start_time = env->NowNanos();
auto ops_remaining = ops;
while(ops_remaining--) {
TVector v;
for (size_t i = 0; i < item_size; ++i) {
v.push_back(items[index++]);
}
}
auto elapsed = env->NowNanos() - start_time;
cout << "created " << ops << " " << name << " instances:\n\t"
<< "each was inserted with " << item_size << " elements\n\t"
<< "total time elapsed: " << elapsed << " (ns)" << endl;
}
template <class TVector>
size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) {
TVector v;
for (const auto& item : GetTestKeys(elem_size)) {
v.push_back(item);
}
auto env = Env::Default();
auto ops_remaining = ops;
auto start_time = env->NowNanos();
size_t total = 0;
while (ops_remaining--) {
auto end = v.end();
for (auto pos = v.begin(); pos != end; ++pos) {
total += pos->size();
}
}
auto elapsed = env->NowNanos() - start_time;
cout << "performed " << ops << " sequence access against " << name << "\n\t"
<< "size: " << elem_size << "\n\t"
<< "total time elapsed: " << elapsed << " (ns)" << endl;
// HACK avoid compiler's optimization to ignore total
return total;
}
// This test case only reports the performance between std::vector<string>
// and autovector<string>. We chose string for comparison because in most
// o our use cases we used std::vector<string>.
TEST(AutoVectorTest, PerfBench) {
// We run same operations for kOps times in order to get a more fair result.
size_t kOps = 100000;
// Creation and insertion test
// Test the case when there is:
// * no element inserted: internal array of std::vector may not really get
// initialize.
// * one element inserted: internal array of std::vector must have
// initialized.
// * kSize elements inserted. This shows the most time we'll spend if we
// keep everything in stack.
// * 2 * kSize elements inserted. The internal vector of
// autovector must have been initialized.
cout << "=====================================================" << endl;
cout << "Creation and Insertion Test (value type: std::string)" << endl;
cout << "=====================================================" << endl;
// pre-generated unique keys
auto string_keys = GetTestKeys(kOps * 2 * kSize);
for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
BenchmarkVectorCreationAndInsertion<vector<string>>(
"vector<string>", kOps, insertions, string_keys
);
BenchmarkVectorCreationAndInsertion<autovector<string, kSize>>(
"autovector<string>", kOps, insertions, string_keys
);
cout << "-----------------------------------" << endl;
}
cout << "=====================================================" << endl;
cout << "Creation and Insertion Test (value type: uint64_t)" << endl;
cout << "=====================================================" << endl;
// pre-generated unique keys
vector<uint64_t> int_keys(kOps * 2 * kSize);
for (size_t i = 0; i < kOps * 2 * kSize; ++i) {
int_keys[i] = i;
}
for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
BenchmarkVectorCreationAndInsertion<vector<uint64_t>>(
"vector<uint64_t>", kOps, insertions, int_keys
);
BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>(
"autovector<uint64_t>", kOps, insertions, int_keys
);
cout << "-----------------------------------" << endl;
}
// Sequence Access Test
cout << "=====================================================" << endl;
cout << "Sequence Access Test" << endl;
cout << "=====================================================" << endl;
for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) {
BenchmarkSequenceAccess<vector<string>>(
"vector", kOps, elem_size
);
BenchmarkSequenceAccess<autovector<string, kSize>>(
"autovector", kOps, elem_size
);
cout << "-----------------------------------" << endl;
}
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

@ -10,7 +10,7 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <list>
#include <vector>
#include "rocksdb/cache.h"
#include "port/port.h"
@ -111,8 +111,8 @@ class HandleTable {
}
void Resize() {
uint32_t new_length = 4;
while (new_length < elems_) {
uint32_t new_length = 16;
while (new_length < elems_ * 1.5) {
new_length *= 2;
}
LRUHandle** new_list = new LRUHandle*[new_length];
@ -264,18 +264,20 @@ Cache::Handle* LRUCache::Insert(
LRUHandle* e = reinterpret_cast<LRUHandle*>(
malloc(sizeof(LRUHandle)-1 + key.size()));
std::list<LRUHandle*> last_reference_list;
std::vector<LRUHandle*> last_reference_list;
last_reference_list.reserve(1);
e->value = value;
e->deleter = deleter;
e->charge = charge;
e->key_length = key.size();
e->hash = hash;
e->refs = 2; // One from LRUCache, one for the returned handle
memcpy(e->key_data, key.data(), key.size());
{
MutexLock l(&mutex_);
e->value = value;
e->deleter = deleter;
e->charge = charge;
e->key_length = key.size();
e->hash = hash;
e->refs = 2; // One from LRUCache, one for the returned handle
memcpy(e->key_data, key.data(), key.size());
LRU_Append(e);
LRUHandle* old = table_.Insert(e);

@ -217,6 +217,17 @@ Slice GetLengthPrefixedSlice(const char* data) {
return Slice(p, len);
}
Slice GetSliceUntil(Slice* slice, char delimiter) {
uint32_t len;
for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
// nothing
}
Slice ret(slice->data(), len);
slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
return ret;
}
void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
uint32_t bits, uint64_t value) {
assert((offset + bits + 7)/8 <= dstlen);

@ -40,6 +40,8 @@ extern bool GetVarint64(Slice* input, uint64_t* value);
extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
extern Slice GetLengthPrefixedSlice(const char* data);
extern Slice GetSliceUntil(Slice* slice, char delimiter);
// Pointer-based variants of GetVarint... These either store a value
// in *v and return a pointer just past the parsed value, or return
// nullptr on error. These routines only look at bytes in the range

@ -395,7 +395,7 @@ class PosixMmapFile : public WritableFile {
}
Status MapNewRegion() {
#ifdef OS_LINUX
#ifdef ROCKSDB_FALLOCATE_PRESENT
assert(base_ == nullptr);
TEST_KILL_RANDOM(rocksdb_kill_odds);
@ -581,7 +581,7 @@ class PosixMmapFile : public WritableFile {
#endif
}
#ifdef OS_LINUX
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM(rocksdb_kill_odds);
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
@ -758,7 +758,7 @@ class PosixWritableFile : public WritableFile {
#endif
}
#ifdef OS_LINUX
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM(rocksdb_kill_odds);
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
@ -862,7 +862,7 @@ class PosixRandomRWFile : public RandomRWFile {
return Status::OK();
}
#ifdef OS_LINUX
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) {
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
return Status::OK();
@ -1303,7 +1303,7 @@ class PosixEnv : public Env {
}
bool SupportsFastAllocate(const std::string& path) {
#ifdef OS_LINUX
#ifdef ROCKSDB_FALLOCATE_PRESENT
struct statfs s;
if (statfs(path.c_str(), &s)){
return false;

@ -36,15 +36,19 @@ class PosixLogger : public Logger {
const static uint64_t flush_every_seconds_ = 5;
std::atomic_uint_fast64_t last_flush_micros_;
Env* env_;
bool flush_pending_;
public:
PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env) :
file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)),
last_flush_micros_(0), env_(env) { }
last_flush_micros_(0), env_(env), flush_pending_(false) { }
virtual ~PosixLogger() {
fclose(file_);
}
virtual void Flush() {
fflush(file_);
if (flush_pending_) {
flush_pending_ = false;
fflush(file_);
}
last_flush_micros_ = env_->NowMicros();
}
virtual void Logv(const char* format, va_list ap) {
@ -107,7 +111,7 @@ class PosixLogger : public Logger {
assert(p <= limit);
const size_t write_size = p - base;
#ifdef OS_LINUX
#ifdef ROCKSDB_FALLOCATE_PRESENT
// If this write would cross a boundary of kDebugLogChunkSize
// space, pre-allocate more space to avoid overly large
// allocations from filesystem allocsize options.
@ -124,6 +128,7 @@ class PosixLogger : public Logger {
#endif
size_t sz = fwrite(base, 1, write_size, file_);
flush_pending_ = true;
assert(sz == write_size);
if (sz > 0) {
log_size_ += write_size;
@ -131,6 +136,7 @@ class PosixLogger : public Logger {
uint64_t now_micros = static_cast<uint64_t>(now_tv.tv_sec) * 1000000 +
now_tv.tv_usec;
if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
flush_pending_ = false;
fflush(file_);
last_flush_micros_ = now_micros;
}

@ -11,4 +11,7 @@ namespace rocksdb {
// Currently supports linux only. No-op otherwise.
void InstallStackTraceHandler();
// Prints stack, skips skip_first_frames frames
void PrintStack(int first_frames_to_skip = 0);
} // namespace rocksdb

@ -16,68 +16,65 @@ namespace rocksdb {
const char* Status::CopyState(const char* state) {
uint32_t size;
memcpy(&size, state, sizeof(size));
char* result = new char[size + 5];
memcpy(result, state, size + 5);
char* result = new char[size + 4];
memcpy(result, state, size + 4);
return result;
}
Status::Status(Code code, const Slice& msg, const Slice& msg2) {
Status::Status(Code code, const Slice& msg, const Slice& msg2) :
code_(code) {
assert(code != kOk);
const uint32_t len1 = msg.size();
const uint32_t len2 = msg2.size();
const uint32_t size = len1 + (len2 ? (2 + len2) : 0);
char* result = new char[size + 5];
char* result = new char[size + 4];
memcpy(result, &size, sizeof(size));
result[4] = static_cast<char>(code);
memcpy(result + 5, msg.data(), len1);
memcpy(result + 4, msg.data(), len1);
if (len2) {
result[5 + len1] = ':';
result[6 + len1] = ' ';
memcpy(result + 7 + len1, msg2.data(), len2);
result[4 + len1] = ':';
result[5 + len1] = ' ';
memcpy(result + 6 + len1, msg2.data(), len2);
}
state_ = result;
}
std::string Status::ToString() const {
if (state_ == nullptr) {
return "OK";
} else {
char tmp[30];
const char* type;
switch (code()) {
case kOk:
type = "OK";
break;
case kNotFound:
type = "NotFound: ";
break;
case kCorruption:
type = "Corruption: ";
break;
case kNotSupported:
type = "Not implemented: ";
break;
case kInvalidArgument:
type = "Invalid argument: ";
break;
case kIOError:
type = "IO error: ";
break;
case kMergeInProgress:
type = "Merge In Progress: ";
break;
default:
snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
static_cast<int>(code()));
type = tmp;
break;
}
std::string result(type);
char tmp[30];
const char* type;
switch (code_) {
case kOk:
return "OK";
case kNotFound:
type = "NotFound: ";
break;
case kCorruption:
type = "Corruption: ";
break;
case kNotSupported:
type = "Not implemented: ";
break;
case kInvalidArgument:
type = "Invalid argument: ";
break;
case kIOError:
type = "IO error: ";
break;
case kMergeInProgress:
type = "Merge In Progress: ";
break;
default:
snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
static_cast<int>(code()));
type = tmp;
break;
}
std::string result(type);
if (state_ != nullptr) {
uint32_t length;
memcpy(&length, state_, sizeof(length));
result.append(state_ + 5, length);
return result;
result.append(state_ + 4, length);
}
return result;
}
} // namespace rocksdb

@ -15,6 +15,7 @@
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "util/random.h"
#include "util/stack_trace.h"
namespace rocksdb {
namespace test {
@ -58,6 +59,7 @@ class Tester {
~Tester() {
if (!ok_) {
fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
PrintStack(2);
exit(1);
}
}

BIN
utilities/.DS_Store vendored

Binary file not shown.

@ -0,0 +1,874 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "utilities/backupable_db.h"
#include "db/filename.h"
#include "util/coding.h"
#include "rocksdb/transaction_log.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm>
#include <vector>
#include <map>
#include <string>
#include <limits>
namespace rocksdb {
// -------- BackupEngine class ---------
class BackupEngine {
public:
BackupEngine(Env* db_env, const BackupableDBOptions& options);
~BackupEngine();
Status CreateNewBackup(DB* db, bool flush_before_backup = false);
Status PurgeOldBackups(uint32_t num_backups_to_keep);
Status DeleteBackup(BackupID backup_id);
void GetBackupInfo(std::vector<BackupInfo>* backup_info);
Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir,
const std::string &wal_dir);
Status RestoreDBFromLatestBackup(const std::string &db_dir,
const std::string &wal_dir) {
return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir);
}
void DeleteBackupsNewerThan(uint64_t sequence_number);
private:
class BackupMeta {
public:
BackupMeta(const std::string& meta_filename,
std::unordered_map<std::string, int>* file_refs, Env* env)
: timestamp_(0), size_(0), meta_filename_(meta_filename),
file_refs_(file_refs), env_(env) {}
~BackupMeta() {}
void RecordTimestamp() {
env_->GetCurrentTime(&timestamp_);
}
int64_t GetTimestamp() const {
return timestamp_;
}
uint64_t GetSize() const {
return size_;
}
void SetSequenceNumber(uint64_t sequence_number) {
sequence_number_ = sequence_number;
}
uint64_t GetSequenceNumber() {
return sequence_number_;
}
void AddFile(const std::string& filename, uint64_t size);
void Delete();
bool Empty() {
return files_.empty();
}
const std::vector<std::string>& GetFiles() {
return files_;
}
Status LoadFromFile(const std::string& backup_dir);
Status StoreToFile(bool sync);
private:
int64_t timestamp_;
// sequence number is only approximate, should not be used
// by clients
uint64_t sequence_number_;
uint64_t size_;
std::string const meta_filename_;
// files with relative paths (without "/" prefix!!)
std::vector<std::string> files_;
std::unordered_map<std::string, int>* file_refs_;
Env* env_;
static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB
}; // BackupMeta
inline std::string GetAbsolutePath(
const std::string &relative_path = "") const {
assert(relative_path.size() == 0 || relative_path[0] != '/');
return options_.backup_dir + "/" + relative_path;
}
inline std::string GetPrivateDirRel() const {
return "private";
}
inline std::string GetPrivateFileRel(BackupID backup_id,
const std::string &file = "") const {
assert(file.size() == 0 || file[0] != '/');
return GetPrivateDirRel() + "/" + std::to_string(backup_id) + "/" + file;
}
inline std::string GetSharedFileRel(const std::string& file = "") const {
assert(file.size() == 0 || file[0] != '/');
return "shared/" + file;
}
inline std::string GetLatestBackupFile(bool tmp = false) const {
return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : ""));
}
inline std::string GetBackupMetaDir() const {
return GetAbsolutePath("meta");
}
inline std::string GetBackupMetaFile(BackupID backup_id) const {
return GetBackupMetaDir() + "/" + std::to_string(backup_id);
}
Status GetLatestBackupFileContents(uint32_t* latest_backup);
Status PutLatestBackupFileContents(uint32_t latest_backup);
// if size_limit == 0, there is no size limit, copy everything
Status CopyFile(const std::string& src,
const std::string& dst,
Env* src_env,
Env* dst_env,
bool sync,
uint64_t* size = nullptr,
uint64_t size_limit = 0);
// if size_limit == 0, there is no size limit, copy everything
Status BackupFile(BackupID backup_id,
BackupMeta* backup,
bool shared,
const std::string& src_dir,
const std::string& src_fname, // starts with "/"
uint64_t size_limit = 0);
// Will delete all the files we don't need anymore
// If full_scan == true, it will do the full scan of files/ directory
// and delete all the files that are not referenced from backuped_file_refs_
void GarbageCollection(bool full_scan);
// backup state data
BackupID latest_backup_id_;
std::map<BackupID, BackupMeta> backups_;
std::unordered_map<std::string, int> backuped_file_refs_;
std::vector<BackupID> obsolete_backups_;
// options data
BackupableDBOptions options_;
Env* db_env_;
Env* backup_env_;
static const size_t copy_file_buffer_size_ = 5 * 1024 * 1024LL; // 5MB
};
BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options)
: options_(options),
db_env_(db_env),
backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_) {
// create all the dirs we need
backup_env_->CreateDirIfMissing(GetAbsolutePath());
backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel()));
backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel()));
backup_env_->CreateDirIfMissing(GetBackupMetaDir());
std::vector<std::string> backup_meta_files;
backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files);
// create backups_ structure
for (auto& file : backup_meta_files) {
BackupID backup_id = 0;
sscanf(file.c_str(), "%u", &backup_id);
if (backup_id == 0 || file != std::to_string(backup_id)) {
// invalid file name, delete that
backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file);
continue;
}
assert(backups_.find(backup_id) == backups_.end());
backups_.insert(std::make_pair(
backup_id, BackupMeta(GetBackupMetaFile(backup_id),
&backuped_file_refs_, backup_env_)));
}
if (options_.destroy_old_data) { // Destory old data
for (auto& backup : backups_) {
backup.second.Delete();
obsolete_backups_.push_back(backup.first);
}
backups_.clear();
// start from beginning
latest_backup_id_ = 0;
// GarbageCollection() will do the actual deletion
} else { // Load data from storage
// load the backups if any
for (auto& backup : backups_) {
Status s = backup.second.LoadFromFile(options_.backup_dir);
if (!s.ok()) {
Log(options_.info_log, "Backup %u corrupted - deleting -- %s",
backup.first, s.ToString().c_str());
backup.second.Delete();
obsolete_backups_.push_back(backup.first);
}
}
// delete obsolete backups from the structure
for (auto ob : obsolete_backups_) {
backups_.erase(ob);
}
Status s = GetLatestBackupFileContents(&latest_backup_id_);
// If latest backup file is corrupted or non-existent
// set latest backup as the biggest backup we have
// or 0 if we have no backups
if (!s.ok() ||
backups_.find(latest_backup_id_) == backups_.end()) {
auto itr = backups_.end();
latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first;
}
}
// delete any backups that claim to be later than latest
for (auto itr = backups_.upper_bound(latest_backup_id_);
itr != backups_.end();) {
itr->second.Delete();
obsolete_backups_.push_back(itr->first);
itr = backups_.erase(itr);
}
PutLatestBackupFileContents(latest_backup_id_); // Ignore errors
GarbageCollection(true);
Log(options_.info_log,
"Initialized BackupEngine, the latest backup is %u.",
latest_backup_id_);
}
BackupEngine::~BackupEngine() {
LogFlush(options_.info_log);
}
void BackupEngine::DeleteBackupsNewerThan(uint64_t sequence_number) {
for (auto backup : backups_) {
if (backup.second.GetSequenceNumber() > sequence_number) {
Log(options_.info_log,
"Deleting backup %u because sequence number (%" PRIu64
") is newer than %" PRIu64 "",
backup.first, backup.second.GetSequenceNumber(), sequence_number);
backup.second.Delete();
obsolete_backups_.push_back(backup.first);
}
}
for (auto ob : obsolete_backups_) {
backups_.erase(backups_.find(ob));
}
auto itr = backups_.end();
latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first;
PutLatestBackupFileContents(latest_backup_id_); // Ignore errors
GarbageCollection(false);
}
Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) {
Status s;
std::vector<std::string> live_files;
VectorLogPtr live_wal_files;
uint64_t manifest_file_size = 0;
uint64_t sequence_number = db->GetLatestSequenceNumber();
s = db->DisableFileDeletions();
if (s.ok()) {
// this will return live_files prefixed with "/"
s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup);
}
// if we didn't flush before backup, we need to also get WAL files
if (s.ok() && !flush_before_backup) {
// returns file names prefixed with "/"
s = db->GetSortedWalFiles(live_wal_files);
}
if (!s.ok()) {
db->EnableFileDeletions();
return s;
}
BackupID new_backup_id = latest_backup_id_ + 1;
assert(backups_.find(new_backup_id) == backups_.end());
auto ret = backups_.insert(std::make_pair(
new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id),
&backuped_file_refs_, backup_env_)));
assert(ret.second == true);
auto& new_backup = ret.first->second;
new_backup.RecordTimestamp();
new_backup.SetSequenceNumber(sequence_number);
Log(options_.info_log, "Started the backup process -- creating backup %u",
new_backup_id);
// create private dir
s = backup_env_->CreateDir(GetAbsolutePath(GetPrivateFileRel(new_backup_id)));
// copy live_files
for (size_t i = 0; s.ok() && i < live_files.size(); ++i) {
uint64_t number;
FileType type;
bool ok = ParseFileName(live_files[i], &number, &type);
if (!ok) {
assert(false);
return Status::Corruption("Can't parse file name. This is very bad");
}
// we should only get sst, manifest and current files here
assert(type == kTableFile ||
type == kDescriptorFile ||
type == kCurrentFile);
// rules:
// * if it's kTableFile, than it's shared
// * if it's kDescriptorFile, limit the size to manifest_file_size
s = BackupFile(new_backup_id,
&new_backup,
type == kTableFile, /* shared */
db->GetName(), /* src_dir */
live_files[i], /* src_fname */
(type == kDescriptorFile) ? manifest_file_size : 0);
}
// copy WAL files
for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) {
if (live_wal_files[i]->Type() == kAliveLogFile) {
// we only care about live log files
// copy the file into backup_dir/files/<new backup>/
s = BackupFile(new_backup_id,
&new_backup,
false, /* not shared */
db->GetOptions().wal_dir,
live_wal_files[i]->PathName());
}
}
// we copied all the files, enable file deletions
db->EnableFileDeletions();
if (s.ok()) {
// persist the backup metadata on the disk
s = new_backup.StoreToFile(options_.sync);
}
if (s.ok()) {
// install the newly created backup meta! (atomic)
s = PutLatestBackupFileContents(new_backup_id);
}
if (!s.ok()) {
// clean all the files we might have created
Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str());
backups_.erase(new_backup_id);
GarbageCollection(true);
return s;
}
// here we know that we succeeded and installed the new backup
// in the LATEST_BACKUP file
latest_backup_id_ = new_backup_id;
Log(options_.info_log, "Backup DONE. All is good");
return s;
}
Status BackupEngine::PurgeOldBackups(uint32_t num_backups_to_keep) {
Log(options_.info_log, "Purging old backups, keeping %u",
num_backups_to_keep);
while (num_backups_to_keep < backups_.size()) {
Log(options_.info_log, "Deleting backup %u", backups_.begin()->first);
backups_.begin()->second.Delete();
obsolete_backups_.push_back(backups_.begin()->first);
backups_.erase(backups_.begin());
}
GarbageCollection(false);
return Status::OK();
}
Status BackupEngine::DeleteBackup(BackupID backup_id) {
Log(options_.info_log, "Deleting backup %u", backup_id);
auto backup = backups_.find(backup_id);
if (backup == backups_.end()) {
return Status::NotFound("Backup not found");
}
backup->second.Delete();
obsolete_backups_.push_back(backup_id);
backups_.erase(backup);
GarbageCollection(false);
return Status::OK();
}
void BackupEngine::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
backup_info->reserve(backups_.size());
for (auto& backup : backups_) {
if (!backup.second.Empty()) {
backup_info->push_back(BackupInfo(
backup.first, backup.second.GetTimestamp(), backup.second.GetSize()));
}
}
}
Status BackupEngine::RestoreDBFromBackup(BackupID backup_id,
const std::string &db_dir,
const std::string &wal_dir) {
auto backup_itr = backups_.find(backup_id);
if (backup_itr == backups_.end()) {
return Status::NotFound("Backup not found");
}
auto& backup = backup_itr->second;
if (backup.Empty()) {
return Status::NotFound("Backup not found");
}
Log(options_.info_log, "Restoring backup id %u\n", backup_id);
// just in case. Ignore errors
db_env_->CreateDirIfMissing(db_dir);
db_env_->CreateDirIfMissing(wal_dir);
// delete log files that might have been already in wal_dir.
// This is important since they might get replayed to the restored DB,
// which will then differ from the backuped DB
std::vector<std::string> delete_children;
db_env_->GetChildren(wal_dir, &delete_children); // ignore errors
for (auto f : delete_children) {
db_env_->DeleteFile(wal_dir + "/" + f); // ignore errors
}
// Also delete all the db_dir children. This is not so important
// because obsolete files will be deleted by DBImpl::PurgeObsoleteFiles()
delete_children.clear();
db_env_->GetChildren(db_dir, &delete_children); // ignore errors
for (auto f : delete_children) {
db_env_->DeleteFile(db_dir + "/" + f); // ignore errors
}
Status s;
for (auto& file : backup.GetFiles()) {
std::string dst;
// 1. extract the filename
size_t slash = file.find_last_of('/');
// file will either be shared/<file> or private/<number>/<file>
assert(slash != std::string::npos);
dst = file.substr(slash + 1);
// 2. find the filetype
uint64_t number;
FileType type;
bool ok = ParseFileName(dst, &number, &type);
if (!ok) {
return Status::Corruption("Backup corrupted");
}
// 3. Construct the final path
// kLogFile lives in wal_dir and all the rest live in db_dir
dst = ((type == kLogFile) ? wal_dir : db_dir) +
"/" + dst;
Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str());
s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false);
if (!s.ok()) {
break;
}
}
Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str());
return s;
}
// latest backup id is an ASCII representation of latest backup id
Status BackupEngine::GetLatestBackupFileContents(uint32_t* latest_backup) {
Status s;
unique_ptr<SequentialFile> file;
s = backup_env_->NewSequentialFile(GetLatestBackupFile(),
&file,
EnvOptions());
if (!s.ok()) {
return s;
}
char buf[11];
Slice data;
s = file->Read(10, &data, buf);
if (!s.ok() || data.size() == 0) {
return s.ok() ? Status::Corruption("Latest backup file corrupted") : s;
}
buf[data.size()] = 0;
*latest_backup = 0;
sscanf(data.data(), "%u", latest_backup);
if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) {
s = Status::Corruption("Latest backup file corrupted");
}
return Status::OK();
}
// this operation HAS to be atomic
// writing 4 bytes to the file is atomic alright, but we should *never*
// do something like 1. delete file, 2. write new file
// We write to a tmp file and then atomically rename
Status BackupEngine::PutLatestBackupFileContents(uint32_t latest_backup) {
Status s;
unique_ptr<WritableFile> file;
EnvOptions env_options;
env_options.use_mmap_writes = false;
s = backup_env_->NewWritableFile(GetLatestBackupFile(true),
&file,
env_options);
if (!s.ok()) {
backup_env_->DeleteFile(GetLatestBackupFile(true));
return s;
}
char file_contents[10];
int len = sprintf(file_contents, "%u\n", latest_backup);
s = file->Append(Slice(file_contents, len));
if (s.ok() && options_.sync) {
file->Sync();
}
if (s.ok()) {
s = file->Close();
}
if (s.ok()) {
// atomically replace real file with new tmp
s = backup_env_->RenameFile(GetLatestBackupFile(true),
GetLatestBackupFile(false));
}
return s;
}
Status BackupEngine::CopyFile(const std::string& src,
const std::string& dst,
Env* src_env,
Env* dst_env,
bool sync,
uint64_t* size,
uint64_t size_limit) {
Status s;
unique_ptr<WritableFile> dst_file;
unique_ptr<SequentialFile> src_file;
EnvOptions env_options;
env_options.use_mmap_writes = false;
if (size != nullptr) {
*size = 0;
}
// Check if size limit is set. if not, set it to very big number
if (size_limit == 0) {
size_limit = std::numeric_limits<uint64_t>::max();
}
s = src_env->NewSequentialFile(src, &src_file, env_options);
if (s.ok()) {
s = dst_env->NewWritableFile(dst, &dst_file, env_options);
}
if (!s.ok()) {
return s;
}
unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
Slice data;
do {
size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
copy_file_buffer_size_ : size_limit;
s = src_file->Read(buffer_to_read, &data, buf.get());
size_limit -= data.size();
if (size != nullptr) {
*size += data.size();
}
if (s.ok()) {
s = dst_file->Append(data);
}
} while (s.ok() && data.size() > 0 && size_limit > 0);
if (s.ok() && sync) {
s = dst_file->Sync();
}
return s;
}
// src_fname will always start with "/"
Status BackupEngine::BackupFile(BackupID backup_id,
BackupMeta* backup,
bool shared,
const std::string& src_dir,
const std::string& src_fname,
uint64_t size_limit) {
assert(src_fname.size() > 0 && src_fname[0] == '/');
std::string dst_relative = src_fname.substr(1);
if (shared) {
dst_relative = GetSharedFileRel(dst_relative);
} else {
dst_relative = GetPrivateFileRel(backup_id, dst_relative);
}
std::string dst_path = GetAbsolutePath(dst_relative);
Status s;
uint64_t size;
// if it's shared, we also need to check if it exists -- if it does,
// no need to copy it again
if (shared && backup_env_->FileExists(dst_path)) {
backup_env_->GetFileSize(dst_path, &size); // Ignore error
Log(options_.info_log, "%s already present", src_fname.c_str());
} else {
Log(options_.info_log, "Copying %s", src_fname.c_str());
s = CopyFile(src_dir + src_fname,
dst_path,
db_env_,
backup_env_,
options_.sync,
&size,
size_limit);
}
if (s.ok()) {
backup->AddFile(dst_relative, size);
}
return s;
}
void BackupEngine::GarbageCollection(bool full_scan) {
Log(options_.info_log, "Starting garbage collection");
std::vector<std::string> to_delete;
for (auto& itr : backuped_file_refs_) {
if (itr.second == 0) {
Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first));
Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
s.ToString().c_str());
to_delete.push_back(itr.first);
}
}
for (auto& td : to_delete) {
backuped_file_refs_.erase(td);
}
if (!full_scan) {
// take care of private dirs -- if full_scan == true, then full_scan will
// take care of them
for (auto backup_id : obsolete_backups_) {
std::string private_dir = GetPrivateFileRel(backup_id);
Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir));
Log(options_.info_log, "Deleting private dir %s -- %s",
private_dir.c_str(), s.ToString().c_str());
}
}
obsolete_backups_.clear();
if (full_scan) {
Log(options_.info_log, "Starting full scan garbage collection");
// delete obsolete shared files
std::vector<std::string> shared_children;
backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
&shared_children);
for (auto& child : shared_children) {
std::string rel_fname = GetSharedFileRel(child);
// if it's not refcounted, delete it
if (backuped_file_refs_.find(rel_fname) == backuped_file_refs_.end()) {
// this might be a directory, but DeleteFile will just fail in that
// case, so we're good
Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
if (s.ok()) {
Log(options_.info_log, "Deleted %s", rel_fname.c_str());
}
}
}
// delete obsolete private files
std::vector<std::string> private_children;
backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
&private_children);
for (auto& child : private_children) {
BackupID backup_id = 0;
sscanf(child.c_str(), "%u", &backup_id);
if (backup_id == 0 || backups_.find(backup_id) != backups_.end()) {
// it's either not a number or it's still alive. continue
continue;
}
// here we have to delete the dir and all its children
std::string full_private_path =
GetAbsolutePath(GetPrivateFileRel(backup_id));
std::vector<std::string> subchildren;
backup_env_->GetChildren(full_private_path, &subchildren);
for (auto& subchild : subchildren) {
Status s = backup_env_->DeleteFile(full_private_path + subchild);
if (s.ok()) {
Log(options_.info_log, "Deleted %s",
(full_private_path + subchild).c_str());
}
}
// finally delete the private dir
Status s = backup_env_->DeleteDir(full_private_path);
Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(),
s.ToString().c_str());
}
}
}
// ------- BackupMeta class --------
void BackupEngine::BackupMeta::AddFile(const std::string& filename,
uint64_t size) {
size_ += size;
files_.push_back(filename);
auto itr = file_refs_->find(filename);
if (itr == file_refs_->end()) {
file_refs_->insert(std::make_pair(filename, 1));
} else {
++itr->second; // increase refcount if already present
}
}
void BackupEngine::BackupMeta::Delete() {
for (auto& file : files_) {
auto itr = file_refs_->find(file);
assert(itr != file_refs_->end());
--(itr->second); // decrease refcount
}
files_.clear();
// delete meta file
env_->DeleteFile(meta_filename_);
timestamp_ = 0;
}
// each backup meta file is of the format:
// <timestamp>
// <seq number>
// <number of files>
// <file1>
// <file2>
// ...
// TODO: maybe add checksum?
Status BackupEngine::BackupMeta::LoadFromFile(const std::string& backup_dir) {
assert(Empty());
Status s;
unique_ptr<SequentialFile> backup_meta_file;
s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions());
if (!s.ok()) {
return s;
}
unique_ptr<char[]> buf(new char[max_backup_meta_file_size_ + 1]);
Slice data;
s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get());
if (!s.ok() || data.size() == max_backup_meta_file_size_) {
return s.ok() ? Status::IOError("File size too big") : s;
}
buf[data.size()] = 0;
uint32_t num_files = 0;
int bytes_read = 0;
sscanf(data.data(), "%" PRId64 "%n", &timestamp_, &bytes_read);
data.remove_prefix(bytes_read + 1); // +1 for '\n'
sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read);
data.remove_prefix(bytes_read + 1); // +1 for '\n'
sscanf(data.data(), "%u%n", &num_files, &bytes_read);
data.remove_prefix(bytes_read + 1); // +1 for '\n'
std::vector<std::pair<std::string, uint64_t>> files;
for (uint32_t i = 0; s.ok() && i < num_files; ++i) {
std::string filename = GetSliceUntil(&data, '\n').ToString();
uint64_t size;
s = env_->GetFileSize(backup_dir + "/" + filename, &size);
files.push_back(std::make_pair(filename, size));
}
if (s.ok()) {
for (auto file : files) {
AddFile(file.first, file.second);
}
}
return s;
}
Status BackupEngine::BackupMeta::StoreToFile(bool sync) {
Status s;
unique_ptr<WritableFile> backup_meta_file;
EnvOptions env_options;
env_options.use_mmap_writes = false;
s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file,
env_options);
if (!s.ok()) {
return s;
}
unique_ptr<char[]> buf(new char[max_backup_meta_file_size_]);
int len = 0, buf_size = max_backup_meta_file_size_;
len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_);
len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n",
sequence_number_);
len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size());
for (size_t i = 0; i < files_.size(); ++i) {
len += snprintf(buf.get() + len, buf_size - len, "%s\n", files_[i].c_str());
}
s = backup_meta_file->Append(Slice(buf.get(), (size_t)len));
if (s.ok() && sync) {
s = backup_meta_file->Sync();
}
if (s.ok()) {
s = backup_meta_file->Close();
}
if (s.ok()) {
s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_);
}
return s;
}
// --- BackupableDB methods --------
BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options)
: StackableDB(db), backup_engine_(new BackupEngine(db->GetEnv(), options)) {
backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber());
}
BackupableDB::~BackupableDB() {
delete backup_engine_;
}
Status BackupableDB::CreateNewBackup(bool flush_before_backup) {
return backup_engine_->CreateNewBackup(this, flush_before_backup);
}
void BackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
backup_engine_->GetBackupInfo(backup_info);
}
Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
return backup_engine_->PurgeOldBackups(num_backups_to_keep);
}
Status BackupableDB::DeleteBackup(BackupID backup_id) {
return backup_engine_->DeleteBackup(backup_id);
}
// --- RestoreBackupableDB methods ------
RestoreBackupableDB::RestoreBackupableDB(Env* db_env,
const BackupableDBOptions& options)
: backup_engine_(new BackupEngine(db_env, options)) {}
RestoreBackupableDB::~RestoreBackupableDB() {
delete backup_engine_;
}
void
RestoreBackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
backup_engine_->GetBackupInfo(backup_info);
}
Status RestoreBackupableDB::RestoreDBFromBackup(BackupID backup_id,
const std::string& db_dir,
const std::string& wal_dir) {
return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir);
}
Status
RestoreBackupableDB::RestoreDBFromLatestBackup(const std::string& db_dir,
const std::string& wal_dir) {
return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir);
}
Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
return backup_engine_->PurgeOldBackups(num_backups_to_keep);
}
Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) {
return backup_engine_->DeleteBackup(backup_id);
}
} // namespace rocksdb

@ -0,0 +1,668 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"
#include "utilities/utility_db.h"
#include "utilities/backupable_db.h"
#include "util/testharness.h"
#include "util/random.h"
#include "util/testutil.h"
#include "util/auto_roll_logger.h"
#include <string>
#include <algorithm>
namespace rocksdb {
namespace {
using std::unique_ptr;
class DummyDB : public StackableDB {
public:
/* implicit */
DummyDB(const Options& options, const std::string& dbname)
: StackableDB(nullptr), options_(options), dbname_(dbname),
deletions_enabled_(true), sequence_number_(0) {}
virtual SequenceNumber GetLatestSequenceNumber() const {
return ++sequence_number_;
}
virtual const std::string& GetName() const override {
return dbname_;
}
virtual Env* GetEnv() const override {
return options_.env;
}
virtual const Options& GetOptions() const override {
return options_;
}
virtual Status EnableFileDeletions() override {
ASSERT_TRUE(!deletions_enabled_);
deletions_enabled_ = true;
return Status::OK();
}
virtual Status DisableFileDeletions() override {
ASSERT_TRUE(deletions_enabled_);
deletions_enabled_ = false;
return Status::OK();
}
virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
bool flush_memtable = true) override {
ASSERT_TRUE(!deletions_enabled_);
vec = live_files_;
*mfs = 100;
return Status::OK();
}
class DummyLogFile : public LogFile {
public:
/* implicit */
DummyLogFile(const std::string& path, bool alive = true)
: path_(path), alive_(alive) {}
virtual std::string PathName() const override {
return path_;
}
virtual uint64_t LogNumber() const {
// what business do you have calling this method?
ASSERT_TRUE(false);
return 0;
}
virtual WalFileType Type() const override {
return alive_ ? kAliveLogFile : kArchivedLogFile;
}
virtual SequenceNumber StartSequence() const {
// backupabledb should not need this method
ASSERT_TRUE(false);
return 0;
}
virtual uint64_t SizeFileBytes() const {
// backupabledb should not need this method
ASSERT_TRUE(false);
return 0;
}
private:
std::string path_;
bool alive_;
}; // DummyLogFile
virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
ASSERT_TRUE(!deletions_enabled_);
files.resize(wal_files_.size());
for (size_t i = 0; i < files.size(); ++i) {
files[i].reset(
new DummyLogFile(wal_files_[i].first, wal_files_[i].second));
}
return Status::OK();
}
std::vector<std::string> live_files_;
// pair<filename, alive?>
std::vector<std::pair<std::string, bool>> wal_files_;
private:
Options options_;
std::string dbname_;
bool deletions_enabled_;
mutable SequenceNumber sequence_number_;
}; // DummyDB
class TestEnv : public EnvWrapper {
public:
explicit TestEnv(Env* t) : EnvWrapper(t) {}
class DummySequentialFile : public SequentialFile {
public:
DummySequentialFile() : SequentialFile(), rnd_(5) {}
virtual Status Read(size_t n, Slice* result, char* scratch) {
size_t read_size = (n > size_left) ? size_left : n;
for (size_t i = 0; i < read_size; ++i) {
scratch[i] = rnd_.Next() & 255;
}
*result = Slice(scratch, read_size);
size_left -= read_size;
return Status::OK();
}
virtual Status Skip(uint64_t n) {
size_left = (n > size_left) ? size_left - n : 0;
return Status::OK();
}
private:
size_t size_left = 200;
Random rnd_;
};
Status NewSequentialFile(const std::string& f,
unique_ptr<SequentialFile>* r,
const EnvOptions& options) {
opened_files_.push_back(f);
if (dummy_sequential_file_) {
r->reset(new TestEnv::DummySequentialFile());
return Status::OK();
} else {
return EnvWrapper::NewSequentialFile(f, r, options);
}
}
Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
const EnvOptions& options) {
if (limit_written_files_ <= 0) {
return Status::IOError("Sorry, can't do this");
}
limit_written_files_--;
return EnvWrapper::NewWritableFile(f, r, options);
}
void AssertOpenedFiles(std::vector<std::string>& should_have_opened) {
sort(should_have_opened.begin(), should_have_opened.end());
sort(opened_files_.begin(), opened_files_.end());
ASSERT_TRUE(opened_files_ == should_have_opened);
}
void ClearOpenedFiles() {
opened_files_.clear();
}
void SetLimitWrittenFiles(uint64_t limit) {
limit_written_files_ = limit;
}
void SetDummySequentialFile(bool dummy_sequential_file) {
dummy_sequential_file_ = dummy_sequential_file;
}
private:
bool dummy_sequential_file_ = false;
std::vector<std::string> opened_files_;
uint64_t limit_written_files_ = 1000000;
}; // TestEnv
class FileManager : public EnvWrapper {
public:
explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
Status DeleteRandomFileInDir(const std::string dir) {
std::vector<std::string> children;
GetChildren(dir, &children);
if (children.size() <= 2) { // . and ..
return Status::NotFound("");
}
while (true) {
int i = rnd_.Next() % children.size();
if (children[i] != "." && children[i] != "..") {
return DeleteFile(dir + "/" + children[i]);
}
}
// should never get here
assert(false);
return Status::NotFound("");
}
Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) {
uint64_t size;
Status s = GetFileSize(fname, &size);
if (!s.ok()) {
return s;
}
unique_ptr<RandomRWFile> file;
EnvOptions env_options;
env_options.use_mmap_writes = false;
s = NewRandomRWFile(fname, &file, env_options);
if (!s.ok()) {
return s;
}
for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) {
std::string tmp;
// write one random byte to a random position
s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp));
}
return s;
}
Status WriteToFile(const std::string& fname, const std::string& data) {
unique_ptr<WritableFile> file;
EnvOptions env_options;
env_options.use_mmap_writes = false;
Status s = EnvWrapper::NewWritableFile(fname, &file, env_options);
if (!s.ok()) {
return s;
}
return file->Append(Slice(data));
}
private:
Random rnd_;
}; // FileManager
// utility functions
static void FillDB(DB* db, int from, int to) {
for (int i = from; i < to; ++i) {
std::string key = "testkey" + std::to_string(i);
std::string value = "testvalue" + std::to_string(i);
ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
}
}
static void AssertExists(DB* db, int from, int to) {
for (int i = from; i < to; ++i) {
std::string key = "testkey" + std::to_string(i);
std::string value;
Status s = db->Get(ReadOptions(), Slice(key), &value);
ASSERT_EQ(value, "testvalue" + std::to_string(i));
}
}
static void AssertEmpty(DB* db, int from, int to) {
for (int i = from; i < to; ++i) {
std::string key = "testkey" + std::to_string(i);
std::string value = "testvalue" + std::to_string(i);
Status s = db->Get(ReadOptions(), Slice(key), &value);
ASSERT_TRUE(s.IsNotFound());
}
}
class BackupableDBTest {
public:
BackupableDBTest() {
// set up files
dbname_ = test::TmpDir() + "/backupable_db";
backupdir_ = test::TmpDir() + "/backupable_db_backup";
// set up envs
env_ = Env::Default();
test_db_env_.reset(new TestEnv(env_));
test_backup_env_.reset(new TestEnv(env_));
file_manager_.reset(new FileManager(env_));
// set up db options
options_.create_if_missing = true;
options_.paranoid_checks = true;
options_.write_buffer_size = 1 << 17; // 128KB
options_.env = test_db_env_.get();
options_.wal_dir = dbname_;
// set up backup db options
CreateLoggerFromOptions(dbname_, backupdir_, env_,
Options(), &logger_);
backupable_options_.reset(new BackupableDBOptions(
backupdir_, test_backup_env_.get(), logger_.get(), true));
// delete old files in db
DestroyDB(dbname_, Options());
}
DB* OpenDB() {
DB* db;
ASSERT_OK(DB::Open(options_, dbname_, &db));
return db;
}
void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false) {
// reset all the defaults
test_backup_env_->SetLimitWrittenFiles(1000000);
test_db_env_->SetLimitWrittenFiles(1000000);
test_db_env_->SetDummySequentialFile(dummy);
DB* db;
if (dummy) {
dummy_db_ = new DummyDB(options_, dbname_);
db = dummy_db_;
} else {
ASSERT_OK(DB::Open(options_, dbname_, &db));
}
backupable_options_->destroy_old_data = destroy_old_data;
db_.reset(new BackupableDB(db, *backupable_options_));
}
void CloseBackupableDB() {
db_.reset(nullptr);
}
void OpenRestoreDB() {
backupable_options_->destroy_old_data = false;
restore_db_.reset(
new RestoreBackupableDB(test_db_env_.get(), *backupable_options_));
}
void CloseRestoreDB() {
restore_db_.reset(nullptr);
}
// restores backup backup_id and asserts the existence of
// [start_exist, end_exist> and not-existence of
// [end_exist, end>
//
// if backup_id == 0, it means restore from latest
// if end == 0, don't check AssertEmpty
void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist,
uint32_t end_exist, uint32_t end = 0) {
bool opened_restore = false;
if (restore_db_.get() == nullptr) {
opened_restore = true;
OpenRestoreDB();
}
if (backup_id > 0) {
ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_));
} else {
ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_));
}
DB* db = OpenDB();
AssertExists(db, start_exist, end_exist);
if (end != 0) {
AssertEmpty(db, end_exist, end);
}
delete db;
if (opened_restore) {
CloseRestoreDB();
}
}
// files
std::string dbname_;
std::string backupdir_;
// envs
Env* env_;
unique_ptr<TestEnv> test_db_env_;
unique_ptr<TestEnv> test_backup_env_;
unique_ptr<FileManager> file_manager_;
// all the dbs!
DummyDB* dummy_db_; // BackupableDB owns dummy_db_
unique_ptr<BackupableDB> db_;
unique_ptr<RestoreBackupableDB> restore_db_;
// options
Options options_;
unique_ptr<BackupableDBOptions> backupable_options_;
std::shared_ptr<Logger> logger_;
}; // BackupableDBTest
void AppendPath(const std::string& path, std::vector<std::string>& v) {
for (auto& f : v) {
f = path + f;
}
}
// this will make sure that backup does not copy the same file twice
TEST(BackupableDBTest, NoDoubleCopy) {
OpenBackupableDB(true, true);
// should write 5 DB files + LATEST_BACKUP + one meta file
test_backup_env_->SetLimitWrittenFiles(7);
test_db_env_->ClearOpenedFiles();
test_db_env_->SetLimitWrittenFiles(0);
dummy_db_->live_files_ = { "/00010.sst", "/00011.sst",
"/CURRENT", "/MANIFEST-01" };
dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
ASSERT_OK(db_->CreateNewBackup(false));
std::vector<std::string> should_have_openened = dummy_db_->live_files_;
should_have_openened.push_back("/00011.log");
AppendPath(dbname_, should_have_openened);
test_db_env_->AssertOpenedFiles(should_have_openened);
// should write 4 new DB files + LATEST_BACKUP + one meta file
// should not write/copy 00010.sst, since it's already there!
test_backup_env_->SetLimitWrittenFiles(6);
test_db_env_->ClearOpenedFiles();
dummy_db_->live_files_ = { "/00010.sst", "/00015.sst",
"/CURRENT", "/MANIFEST-01" };
dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
ASSERT_OK(db_->CreateNewBackup(false));
// should not open 00010.sst - it's already there
should_have_openened = { "/00015.sst", "/CURRENT",
"/MANIFEST-01", "/00011.log" };
AppendPath(dbname_, should_have_openened);
test_db_env_->AssertOpenedFiles(should_have_openened);
ASSERT_OK(db_->DeleteBackup(1));
ASSERT_EQ(true,
test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
// 00011.sst was only in backup 1, should be deleted
ASSERT_EQ(false,
test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
ASSERT_EQ(true,
test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
// MANIFEST file size should be only 100
uint64_t size;
test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size);
ASSERT_EQ(100UL, size);
test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size);
ASSERT_EQ(200UL, size);
CloseBackupableDB();
}
// test various kind of corruptions that may happen:
// 1. Not able to write a file for backup - that backup should fail,
// everything else should work
// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine
// 3. Corrupted backup meta file or missing backuped file - we should
// not be able to open that backup, but all other backups should be
// fine
TEST(BackupableDBTest, CorruptionsTest) {
const int keys_iteration = 5000;
Random rnd(6);
Status s;
OpenBackupableDB(true);
// create five backups
for (int i = 0; i < 5; ++i) {
FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
}
// ---------- case 1. - fail a write -----------
// try creating backup 6, but fail a write
FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
test_backup_env_->SetLimitWrittenFiles(2);
// should fail
s = db_->CreateNewBackup(!!(rnd.Next() % 2));
ASSERT_TRUE(!s.ok());
test_backup_env_->SetLimitWrittenFiles(1000000);
// latest backup should have all the keys
CloseBackupableDB();
AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
// ---------- case 2. - corrupt/delete latest backup -----------
ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2));
AssertBackupConsistency(0, 0, keys_iteration * 5);
ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP"));
AssertBackupConsistency(0, 0, keys_iteration * 5);
// create backup 6, point LATEST_BACKUP to 5
OpenBackupableDB();
FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
ASSERT_OK(db_->CreateNewBackup(false));
CloseBackupableDB();
ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5"));
AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
// assert that all 6 data is gone!
ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false);
ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false);
// --------- case 3. corrupted backup meta or missing backuped file ----
ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3));
// since 5 meta is now corrupted, latest backup should be 4
AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5);
OpenRestoreDB();
s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_);
ASSERT_TRUE(!s.ok());
CloseRestoreDB();
ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4"));
// 4 is corrupted, 3 is the latest backup now
AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5);
OpenRestoreDB();
s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_);
CloseRestoreDB();
ASSERT_TRUE(!s.ok());
// new backup should be 4!
OpenBackupableDB();
FillDB(db_.get(), keys_iteration * 3, keys_iteration * 4);
ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
CloseBackupableDB();
AssertBackupConsistency(4, 0, keys_iteration * 4, keys_iteration * 5);
}
// open DB, write, close DB, backup, restore, repeat
TEST(BackupableDBTest, OfflineIntegrationTest) {
// has to be a big number, so that it triggers the memtable flush
const int keys_iteration = 5000;
const int max_key = keys_iteration * 4 + 10;
// first iter -- flush before backup
// second iter -- don't flush before backup
for (int iter = 0; iter < 2; ++iter) {
// delete old data
DestroyDB(dbname_, Options());
bool destroy_data = true;
// every iteration --
// 1. insert new data in the DB
// 2. backup the DB
// 3. destroy the db
// 4. restore the db, check everything is still there
for (int i = 0; i < 5; ++i) {
// in last iteration, put smaller amount of data,
int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
// ---- insert new data and back up ----
OpenBackupableDB(destroy_data);
destroy_data = false;
FillDB(db_.get(), keys_iteration * i, fill_up_to);
ASSERT_OK(db_->CreateNewBackup(iter == 0));
CloseBackupableDB();
DestroyDB(dbname_, Options());
// ---- make sure it's empty ----
DB* db = OpenDB();
AssertEmpty(db, 0, fill_up_to);
delete db;
// ---- restore the DB ----
OpenRestoreDB();
if (i >= 3) { // test purge old backups
// when i == 4, purge to only 1 backup
// when i == 3, purge to 2 backups
ASSERT_OK(restore_db_->PurgeOldBackups(5 - i));
}
// ---- make sure the data is there ---
AssertBackupConsistency(0, 0, fill_up_to, max_key);
CloseRestoreDB();
}
}
}
// open DB, write, backup, write, backup, close, restore
TEST(BackupableDBTest, OnlineIntegrationTest) {
// has to be a big number, so that it triggers the memtable flush
const int keys_iteration = 5000;
const int max_key = keys_iteration * 4 + 10;
Random rnd(7);
// delete old data
DestroyDB(dbname_, Options());
OpenBackupableDB(true);
// write some data, backup, repeat
for (int i = 0; i < 5; ++i) {
if (i == 4) {
// delete backup number 2, online delete!
OpenRestoreDB();
ASSERT_OK(restore_db_->DeleteBackup(2));
CloseRestoreDB();
}
// in last iteration, put smaller amount of data,
// so that backups can share sst files
int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
FillDB(db_.get(), keys_iteration * i, fill_up_to);
// we should get consistent results with flush_before_backup
// set to both true and false
ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
}
// close and destroy
CloseBackupableDB();
DestroyDB(dbname_, Options());
// ---- make sure it's empty ----
DB* db = OpenDB();
AssertEmpty(db, 0, max_key);
delete db;
// ---- restore every backup and verify all the data is there ----
OpenRestoreDB();
for (int i = 1; i <= 5; ++i) {
if (i == 2) {
// we deleted backup 2
Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_);
ASSERT_TRUE(!s.ok());
} else {
int fill_up_to = std::min(keys_iteration * i, max_key);
AssertBackupConsistency(i, 0, fill_up_to, max_key);
}
}
// delete some backups -- this should leave only backups 3 and 5 alive
ASSERT_OK(restore_db_->DeleteBackup(4));
ASSERT_OK(restore_db_->PurgeOldBackups(2));
std::vector<BackupInfo> backup_info;
restore_db_->GetBackupInfo(&backup_info);
ASSERT_EQ(2UL, backup_info.size());
// check backup 3
AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key);
// check backup 5
AssertBackupConsistency(5, 0, max_key);
CloseRestoreDB();
}
TEST(BackupableDBTest, DeleteNewerBackups) {
// create backups 1, 2, 3, 4, 5
OpenBackupableDB(true);
for (int i = 0; i < 5; ++i) {
FillDB(db_.get(), 100 * i, 100 * (i + 1));
ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
}
CloseBackupableDB();
// backup 3 is fine
AssertBackupConsistency(3, 0, 300, 500);
// this should delete backups 4 and 5
OpenBackupableDB();
CloseBackupableDB();
// backups 4 and 5 don't exist
OpenRestoreDB();
Status s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_);
ASSERT_TRUE(s.IsNotFound());
s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_);
ASSERT_TRUE(s.IsNotFound());
CloseRestoreDB();
}
} // anon namespace
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

Binary file not shown.

@ -41,9 +41,7 @@ std::shared_ptr<DB> OpenTtlDb(char delim_char) {
Options options;
options.create_if_missing = true;
options.merge_operator.reset(new StringAppendTESTOperator(delim_char));
Status s;
db = new DBWithTTL(123456, options, kDbName, s, false);
ASSERT_OK(s);
ASSERT_OK(UtilityDB::OpenTtlDB(options, kDbName, &db, 123456));
return std::shared_ptr<DB>(db);
}
@ -53,6 +51,7 @@ class StringLists {
public:
//Constructor: specifies the rocksdb db
/* implicit */
StringLists(std::shared_ptr<DB> db)
: db_(db),
merge_option_(),
@ -75,7 +74,7 @@ class StringLists {
// Returns the list of strings associated with key (or "" if does not exist)
bool Get(const std::string& key, std::string* const result){
assert(result != NULL); // we should have a place to store the result
assert(result != nullptr); // we should have a place to store the result
auto s = db_->Get(get_option_, key, result);
if (s.ok()) {

@ -10,40 +10,27 @@
namespace rocksdb {
// Open the db inside DBWithTTL because options needs pointer to its ttl
DBWithTTL::DBWithTTL(const int32_t ttl,
const Options& options,
const std::string& dbname,
Status& st,
bool read_only)
: StackableDB(nullptr) {
Options options_to_open = options;
if (options.compaction_filter) {
ttl_comp_filter_.reset(
new TtlCompactionFilter(ttl, options.compaction_filter));
options_to_open.compaction_filter = ttl_comp_filter_.get();
void DBWithTTL::SanitizeOptions(int32_t ttl, Options* options) {
if (options->compaction_filter) {
options->compaction_filter =
new TtlCompactionFilter(ttl, options->compaction_filter);
} else {
options_to_open.compaction_filter_factory =
std::shared_ptr<CompactionFilterFactory>(
new TtlCompactionFilterFactory(
ttl, options.compaction_filter_factory));
options->compaction_filter_factory =
std::shared_ptr<CompactionFilterFactory>(new TtlCompactionFilterFactory(
ttl, options->compaction_filter_factory));
}
if (options.merge_operator) {
options_to_open.merge_operator.reset(
new TtlMergeOperator(options.merge_operator));
}
if (read_only) {
st = DB::OpenForReadOnly(options_to_open, dbname, &db_);
} else {
st = DB::Open(options_to_open, dbname, &db_);
if (options->merge_operator) {
options->merge_operator.reset(
new TtlMergeOperator(options->merge_operator));
}
}
// Open the db inside DBWithTTL because options needs pointer to its ttl
DBWithTTL::DBWithTTL(DB* db) : StackableDB(db) {}
DBWithTTL::~DBWithTTL() {
delete db_;
delete GetOptions().compaction_filter;
}
Status UtilityDB::OpenTtlDB(
@ -53,9 +40,19 @@ Status UtilityDB::OpenTtlDB(
int32_t ttl,
bool read_only) {
Status st;
*dbptr = new DBWithTTL(ttl, options, dbname, st, read_only);
if (!st.ok()) {
delete *dbptr;
Options options_to_open = options;
DBWithTTL::SanitizeOptions(ttl, &options_to_open);
DB* db;
if (read_only) {
st = DB::OpenForReadOnly(options_to_open, dbname, &db);
} else {
st = DB::Open(options_to_open, dbname, &db);
}
if (st.ok()) {
*dbptr = new DBWithTTL(db);
} else {
delete db;
}
return st;
}
@ -122,10 +119,8 @@ Status DBWithTTL::StripTS(std::string* str) {
return st;
}
Status DBWithTTL::Put(
const WriteOptions& opt,
const Slice& key,
const Slice& val) {
Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key,
const Slice& val) {
WriteBatch batch;
batch.Put(key, val);
return Write(opt, &batch);
@ -166,10 +161,6 @@ bool DBWithTTL::KeyMayExist(const ReadOptions& options,
return ret;
}
Status DBWithTTL::Delete(const WriteOptions& wopts, const Slice& key) {
return db_->Delete(wopts, key);
}
Status DBWithTTL::Merge(const WriteOptions& opt,
const Slice& key,
const Slice& value) {
@ -221,86 +212,6 @@ Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) {
return new TtlIterator(db_->NewIterator(opts));
}
const Snapshot* DBWithTTL::GetSnapshot() {
return db_->GetSnapshot();
}
void DBWithTTL::ReleaseSnapshot(const Snapshot* snapshot) {
db_->ReleaseSnapshot(snapshot);
}
bool DBWithTTL::GetProperty(const Slice& property, std::string* value) {
return db_->GetProperty(property, value);
}
void DBWithTTL::GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
db_->GetApproximateSizes(r, n, sizes);
}
void DBWithTTL::CompactRange(const Slice* begin, const Slice* end,
bool reduce_level, int target_level) {
db_->CompactRange(begin, end, reduce_level, target_level);
}
int DBWithTTL::NumberLevels() {
return db_->NumberLevels();
}
int DBWithTTL::MaxMemCompactionLevel() {
return db_->MaxMemCompactionLevel();
}
int DBWithTTL::Level0StopWriteTrigger() {
return db_->Level0StopWriteTrigger();
}
Env* DBWithTTL::GetEnv() const {
return db_->GetEnv();
}
const Options& DBWithTTL::GetOptions() const {
return db_->GetOptions();
}
Status DBWithTTL::Flush(const FlushOptions& fopts) {
return db_->Flush(fopts);
}
Status DBWithTTL::DisableFileDeletions() {
return db_->DisableFileDeletions();
}
Status DBWithTTL::EnableFileDeletions() {
return db_->EnableFileDeletions();
}
Status DBWithTTL::GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
bool flush_memtable) {
return db_->GetLiveFiles(vec, mfs, flush_memtable);
}
SequenceNumber DBWithTTL::GetLatestSequenceNumber() const {
return db_->GetLatestSequenceNumber();
}
Status DBWithTTL::GetSortedWalFiles(VectorLogPtr& files) {
return db_->GetSortedWalFiles(files);
}
Status DBWithTTL::DeleteFile(std::string name) {
return db_->DeleteFile(name);
}
Status DBWithTTL::GetDbIdentity(std::string& identity) {
return db_->GetDbIdentity(identity);
}
Status DBWithTTL::GetUpdatesSince(
SequenceNumber seq_number,
unique_ptr<TransactionLogIterator>* iter) {
return db_->GetUpdatesSince(seq_number, iter);
}
void DBWithTTL::TEST_Destroy_DBWithTtl() {
((DBImpl*) db_)->TEST_Destroy_DBImpl();
}

@ -14,82 +14,33 @@ namespace rocksdb {
class DBWithTTL : public StackableDB {
public:
DBWithTTL(const int32_t ttl,
const Options& options,
const std::string& dbname,
Status& st,
bool read_only);
static void SanitizeOptions(int32_t ttl, Options* options);
explicit DBWithTTL(DB* db);
virtual ~DBWithTTL();
virtual Status Put(const WriteOptions& o,
const Slice& key,
const Slice& val);
virtual Status Put(const WriteOptions& o, const Slice& key,
const Slice& val) override;
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value);
virtual Status Get(const ReadOptions& options, const Slice& key,
std::string* value) override;
virtual std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values);
virtual std::vector<Status> MultiGet(
const ReadOptions& options, const std::vector<Slice>& keys,
std::vector<std::string>* values) override;
virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key,
std::string* value,
bool* value_found = nullptr) override;
virtual Status Delete(const WriteOptions& wopts, const Slice& key);
virtual Status Merge(const WriteOptions& options,
const Slice& key,
const Slice& value);
virtual Status Write(const WriteOptions& opts, WriteBatch* updates);
virtual Iterator* NewIterator(const ReadOptions& opts);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value);
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes);
virtual void CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1);
virtual int NumberLevels();
virtual int MaxMemCompactionLevel();
virtual Status Merge(const WriteOptions& options, const Slice& key,
const Slice& value) override;
virtual int Level0StopWriteTrigger();
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
virtual Env* GetEnv() const;
virtual const Options& GetOptions() const;
virtual Status Flush(const FlushOptions& fopts);
virtual Status DisableFileDeletions();
virtual Status EnableFileDeletions();
virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
bool flush_memtable = true);
virtual Status GetSortedWalFiles(VectorLogPtr& files);
virtual Status DeleteFile(std::string name);
virtual Status GetDbIdentity(std::string& identity);
virtual SequenceNumber GetLatestSequenceNumber() const;
virtual Status GetUpdatesSince(SequenceNumber seq_number,
unique_ptr<TransactionLogIterator>* iter);
virtual Iterator* NewIterator(const ReadOptions& opts) override;
// Simulate a db crash, no elegant closing of database.
void TEST_Destroy_DBWithTtl();
@ -113,10 +64,6 @@ class DBWithTTL : public StackableDB {
static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8
static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8
private:
DB* db_;
unique_ptr<CompactionFilter> ttl_comp_filter_;
};
class TtlIterator : public Iterator {

Loading…
Cancel
Save