Merge branch 'master' of github.com:facebook/rocksdb into HEAD

main
Yueh-Hsuan Chiang 11 years ago
commit c65448f95a
  1. 6
      .gitignore
  2. 13
      HISTORY.md
  3. 3
      INSTALL.md
  4. 34
      Makefile
  5. 9
      build_tools/build_detect_platform
  6. 82
      db/c.cc
  7. 1
      db/c_test.c
  8. 489
      db/column_family.cc
  9. 408
      db/column_family.h
  10. 857
      db/column_family_test.cc
  11. 32
      db/compaction.cc
  12. 8
      db/compaction.h
  13. 16
      db/compaction_picker.cc
  14. 2
      db/compaction_picker.h
  15. 633
      db/db_bench.cc
  16. 74
      db/db_filesnapshot.cc
  17. 1775
      db/db_impl.cc
  18. 287
      db/db_impl.h
  19. 51
      db/db_impl_readonly.cc
  20. 116
      db/db_impl_readonly.h
  21. 91
      db/db_iter.cc
  22. 2
      db/db_stats_logger.cc
  23. 2385
      db/db_test.cc
  24. 4
      db/dbformat.cc
  25. 73
      db/dbformat.h
  26. 18
      db/internal_stats.cc
  27. 4
      db/internal_stats.h
  28. 22
      db/memtable.cc
  29. 16
      db/memtable.h
  30. 22
      db/memtable_list.cc
  31. 16
      db/memtable_list.h
  32. 42
      db/plain_table_db_test.cc
  33. 15
      db/repair.cc
  34. 17
      db/table_cache.cc
  35. 6
      db/table_cache.h
  36. 26
      db/tailing_iter.cc
  37. 5
      db/tailing_iter.h
  38. 2
      db/transaction_log_impl.cc
  39. 4
      db/transaction_log_impl.h
  40. 75
      db/version_edit.cc
  41. 57
      db/version_edit.h
  42. 13
      db/version_edit_test.cc
  43. 1027
      db/version_set.cc
  44. 131
      db/version_set.h
  45. 283
      db/write_batch.cc
  46. 63
      db/write_batch_internal.h
  47. 86
      db/write_batch_test.cc
  48. 18
      include/rocksdb/c.h
  49. 176
      include/rocksdb/db.h
  50. 4
      include/rocksdb/env.h
  51. 16
      include/rocksdb/memtablerep.h
  52. 501
      include/rocksdb/options.h
  53. 4
      include/rocksdb/perf_context.h
  54. 60
      include/rocksdb/write_batch.h
  55. 114
      include/utilities/stackable_db.h
  56. 4
      java/rocksjni/write_batch.cc
  57. 7
      port/port_example.h
  58. 4
      port/port_posix.h
  59. 6
      table/block_based_table_reader.cc
  60. 3
      table/filter_block.h
  61. 13
      table/format.cc
  62. 36
      table/merger.cc
  63. 3
      table/merger.h
  64. 9
      table/plain_table_reader.cc
  65. 3
      table/table_test.cc
  66. 71
      tools/auto_sanity_test.sh
  67. 1
      tools/db_crashtest.py
  68. 1
      tools/db_crashtest2.py
  69. 594
      tools/db_stress.cc
  70. 2
      util/auto_roll_logger.cc
  71. 2
      util/auto_roll_logger.h
  72. 2
      util/auto_roll_logger_test.cc
  73. 30
      util/crc32c.cc
  74. 26
      util/dynamic_bloom_test.cc
  75. 6
      util/env.cc
  76. 48
      util/hash_linklist_rep.cc
  77. 8
      util/hash_skiplist_rep.cc
  78. 72
      util/ldb_cmd.cc
  79. 17
      util/ldb_cmd.h
  80. 1
      util/ldb_tool.cc
  81. 323
      util/options.cc
  82. 20
      util/perf_context.cc
  83. 84
      util/perf_context_imp.h
  84. 6
      util/skiplistrep.cc
  85. 62
      util/sync_point.cc
  86. 79
      util/sync_point.h
  87. 1
      util/thread_local.h
  88. 8
      util/vectorrep.cc
  89. 32
      utilities/backupable/backupable_db_test.cc
  90. 6
      utilities/geodb/geodb_test.cc
  91. 34
      utilities/ttl/db_ttl.cc
  92. 27
      utilities/ttl/db_ttl.h

6
.gitignore vendored

@ -13,6 +13,10 @@ build_config.mk
*_bench *_bench
*_stress *_stress
*.out *.out
*.class
*.jar
*.*jnilib*
*.d-e
ldb ldb
manifest_dump manifest_dump
@ -23,3 +27,5 @@ coverage/COVERAGE_REPORT
.gdbhistory .gdbhistory
.phutil_module_cache .phutil_module_cache
tags tags
java/*.log
java/include/org_rocksdb_*.h

@ -1,11 +1,15 @@
# Rocksdb Change Log # Rocksdb Change Log
## Unreleased ## Unreleased (will be released in 3.0)
* Column family support
### Public API changes ### Public API changes
## 2.8.0 (04/04/2014)
* Removed arena.h from public header files. * Removed arena.h from public header files.
* By default, checksums are verified on every read from database * By default, checksums are verified on every read from database
* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
* Added is_manual_compaction to CompactionFilter::Context * Added is_manual_compaction to CompactionFilter::Context
* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op. * Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
* Removed BackupEngine::DeleteBackupsNewerThan() function * Removed BackupEngine::DeleteBackupsNewerThan() function
@ -15,11 +19,18 @@
* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools * Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
* Added a command "checkconsistency" in ldb tool, which checks * Added a command "checkconsistency" in ldb tool, which checks
if file system state matches DB state (file existence and file sizes) if file system state matches DB state (file existence and file sizes)
* Separate options related to block based table to a new struct BlockBasedTableOptions
* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
* Add more counters to perf context.
* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
### New Features ### New Features
* If we find one truncated record at the end of the MANIFEST or WAL files, * If we find one truncated record at the end of the MANIFEST or WAL files,
we will ignore it. We assume that writers of these records were interrupted we will ignore it. We assume that writers of these records were interrupted
and that we can safely ignore it. and that we can safely ignore it.
* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory().
* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
* Geo-spatial support for locations and radial-search. * Geo-spatial support for locations and radial-search.

@ -67,6 +67,9 @@ libraries. You are on your own.
* Please note that some of the optimizations/features are disabled in OSX. * Please note that some of the optimizations/features are disabled in OSX.
We did not run any production workloads on it. We did not run any production workloads on it.
* **iOS**:
* Run: `TARGET_OS=IOS make static_lib`
## Compilation ## Compilation
`make clean; make` will compile librocksdb.a (RocksDB static library) and all `make clean; make` will compile librocksdb.a (RocksDB static library) and all
the unit tests. You can run all unit tests with `make check`. the unit tests. You can run all unit tests with `make check`.

@ -23,6 +23,14 @@ $(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platf
# this file is generated by the previous line to set build flags and sources # this file is generated by the previous line to set build flags and sources
include build_config.mk include build_config.mk
ifneq ($(PLATFORM), IOS)
CFLAGS += -g
CXXFLAGS += -g
else
# no debug info for IOS, that will make our library big
OPT += -DNDEBUG
endif
# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
ifdef COMPILE_WITH_ASAN ifdef COMPILE_WITH_ASAN
# ASAN compile flags # ASAN compile flags
@ -37,8 +45,8 @@ else
endif endif
WARNING_FLAGS = -Wall -Werror -Wno-sign-compare WARNING_FLAGS = -Wall -Werror -Wno-sign-compare
CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
LDFLAGS += $(PLATFORM_LDFLAGS) LDFLAGS += $(PLATFORM_LDFLAGS)
@ -57,6 +65,7 @@ TESTS = \
db_test \ db_test \
block_hash_index_test \ block_hash_index_test \
autovector_test \ autovector_test \
column_family_test \
table_properties_collector_test \ table_properties_collector_test \
arena_test \ arena_test \
auto_roll_logger_test \ auto_roll_logger_test \
@ -148,11 +157,15 @@ $(SHARED3):
endif # PLATFORM_SHARED_EXT endif # PLATFORM_SHARED_EXT
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
release tags valgrind_check whitebox_crash_test format shared_lib all \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
dbg dbg
all: $(LIBRARY) $(PROGRAMS) all: $(LIBRARY) $(PROGRAMS)
static_lib: $(LIBRARY)
shared_lib: $(SHARED)
dbg: $(LIBRARY) $(PROGRAMS) dbg: $(LIBRARY) $(PROGRAMS)
# Will also generate shared libraries. # Will also generate shared libraries.
@ -218,8 +231,6 @@ tags:
format: format:
build_tools/format-diff.sh build_tools/format-diff.sh
shared_lib: $(SHARED)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Unit tests and tools # Unit tests and tools
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -260,6 +271,9 @@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -435,20 +449,20 @@ ifeq ($(PLATFORM), IOS)
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString) IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
.cc.o: .cc.o:
mkdir -p ios-x86/$(dir $@) mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS) $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@) mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS) xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@ lipo ios-x86/$@ ios-arm/$@ -create -output $@
.c.o: .c.o:
mkdir -p ios-x86/$(dir $@) mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@) mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@ lipo ios-x86/$@ ios-arm/$@ -create -output $@
else else

@ -87,7 +87,7 @@ PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=false PLATFORM_SHARED_VERSIONED=false
# generic port files (working on all platform by #ifdef) go directly in /port # generic port files (working on all platform by #ifdef) go directly in /port
GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` GENERIC_PORT_FILES=`cd $ROCKSDB_ROOT; find port -name '*.cc' | tr "\n" " "`
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp # On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in case "$TARGET_OS" in
@ -98,6 +98,13 @@ case "$TARGET_OS" in
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
# PORT_FILES=port/darwin/darwin_specific.cc # PORT_FILES=port/darwin/darwin_specific.cc
;; ;;
IOS)
PLATFORM=IOS
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE"
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
CROSS_COMPILE=true
;;
Linux) Linux)
PLATFORM=OS_LINUX PLATFORM=OS_LINUX
COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX" COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"

@ -25,12 +25,14 @@
#include "rocksdb/universal_compaction.h" #include "rocksdb/universal_compaction.h"
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
using rocksdb::Cache; using rocksdb::Cache;
using rocksdb::Comparator; using rocksdb::Comparator;
using rocksdb::CompressionType; using rocksdb::CompressionType;
using rocksdb::DB; using rocksdb::DB;
using rocksdb::Env; using rocksdb::Env;
using rocksdb::InfoLogLevel;
using rocksdb::FileLock; using rocksdb::FileLock;
using rocksdb::FilterPolicy; using rocksdb::FilterPolicy;
using rocksdb::FlushOptions; using rocksdb::FlushOptions;
@ -656,6 +658,11 @@ void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
} }
} }
void rocksdb_options_set_info_log_level(
rocksdb_options_t* opt, int v) {
opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
}
void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s; opt->rep.write_buffer_size = s;
} }
@ -714,6 +721,14 @@ void rocksdb_options_set_max_grandparent_overlap_factor(
opt->rep.max_grandparent_overlap_factor = n; opt->rep.max_grandparent_overlap_factor = n;
} }
void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
rocksdb_options_t* opt, int* level_values, size_t num_levels) {
opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
for (size_t i = 0; i < num_levels; ++i) {
opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
}
}
void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
opt->rep.statistics = rocksdb::CreateDBStatistics(); opt->rep.statistics = rocksdb::CreateDBStatistics();
} }
@ -857,6 +872,24 @@ void rocksdb_options_set_advise_random_on_open(
opt->rep.advise_random_on_open = v; opt->rep.advise_random_on_open = v;
} }
void rocksdb_options_set_access_hint_on_compaction_start(
rocksdb_options_t* opt, int v) {
switch(v) {
case 0:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::NONE;
break;
case 1:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::NORMAL;
break;
case 2:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::SEQUENTIAL;
break;
case 3:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::WILLNEED;
break;
}
}
void rocksdb_options_set_use_adaptive_mutex( void rocksdb_options_set_use_adaptive_mutex(
rocksdb_options_t* opt, unsigned char v) { rocksdb_options_t* opt, unsigned char v) {
opt->rep.use_adaptive_mutex = v; opt->rep.use_adaptive_mutex = v;
@ -867,6 +900,11 @@ void rocksdb_options_set_bytes_per_sync(
opt->rep.bytes_per_sync = v; opt->rep.bytes_per_sync = v;
} }
void rocksdb_options_set_verify_checksums_in_compaction(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.verify_checksums_in_compaction = v;
}
void rocksdb_options_set_filter_deletes( void rocksdb_options_set_filter_deletes(
rocksdb_options_t* opt, unsigned char v) { rocksdb_options_t* opt, unsigned char v) {
opt->rep.filter_deletes = v; opt->rep.filter_deletes = v;
@ -1003,11 +1041,48 @@ void rocksdb_options_set_hash_link_list_rep(
opt->rep.memtable_factory.reset(factory); opt->rep.memtable_factory.reset(factory);
} }
void rocksdb_options_set_plain_table_factory(
rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness) {
static rocksdb::TableFactory* factory = 0;
if (!factory) {
factory = rocksdb::NewPlainTableFactory(
user_key_len, bloom_bits_per_key,
hash_table_ratio, index_sparseness);
}
opt->rep.table_factory.reset(factory);
}
void rocksdb_options_set_max_successive_merges( void rocksdb_options_set_max_successive_merges(
rocksdb_options_t* opt, size_t v) { rocksdb_options_t* opt, size_t v) {
opt->rep.max_successive_merges = v; opt->rep.max_successive_merges = v;
} }
void rocksdb_options_set_min_partial_merge_operands(
rocksdb_options_t* opt, uint32_t v) {
opt->rep.min_partial_merge_operands = v;
}
void rocksdb_options_set_bloom_locality(
rocksdb_options_t* opt, uint32_t v) {
opt->rep.bloom_locality = v;
}
void rocksdb_options_set_allow_thread_local(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.allow_thread_local = v;
}
void rocksdb_options_set_inplace_update_support(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.inplace_update_support = v;
}
void rocksdb_options_set_inplace_update_num_locks(
rocksdb_options_t* opt, size_t v) {
opt->rep.inplace_update_num_locks = v;
}
void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style); opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style);
} }
@ -1022,21 +1097,14 @@ DB::OpenForReadOnly
DB::MultiGet DB::MultiGet
DB::KeyMayExist DB::KeyMayExist
DB::GetOptions DB::GetOptions
DB::GetLiveFiles
DB::GetSortedWalFiles DB::GetSortedWalFiles
DB::GetLatestSequenceNumber DB::GetLatestSequenceNumber
DB::GetUpdatesSince DB::GetUpdatesSince
DB::DeleteFile
DB::GetDbIdentity DB::GetDbIdentity
DB::RunManualCompaction DB::RunManualCompaction
custom cache custom cache
compaction_filter compaction_filter
max_bytes_for_level_multiplier_additional
access_hint_on_compaction_start
table_factory
table_properties_collectors table_properties_collectors
inplace_update_support
inplace_update_num_locks
*/ */
rocksdb_comparator_t* rocksdb_comparator_create( rocksdb_comparator_t* rocksdb_comparator_create(

@ -443,6 +443,7 @@ int main(int argc, char** argv) {
rocksdb_options_set_filter_policy(options, policy); rocksdb_options_set_filter_policy(options, policy);
rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4); rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
db = rocksdb_open(options, dbname, &err); db = rocksdb_open(options, dbname, &err);
CheckNoError(err); CheckNoError(err);

@ -0,0 +1,489 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/column_family.h"
#include <vector>
#include <string>
#include <algorithm>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "db/internal_stats.h"
#include "db/compaction_picker.h"
#include "db/table_properties_collector.h"
#include "util/autovector.h"
#include "util/hash_skiplist_rep.h"
namespace rocksdb {
// Wraps a ColumnFamilyData for external callers; the handle holds one
// reference on the column family for as long as it is alive.
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
                                               DBImpl* db, port::Mutex* mutex)
    : cfd_(cfd), db_(db), mutex_(mutex) {
  // cfd may be nullptr (e.g. a handle that was never bound).
  if (cfd_) {
    cfd_->Ref();
  }
}
// Drops the handle's reference. If that was the last reference, the
// ColumnFamilyData is deleted and any files that became obsolete as a
// result are garbage-collected.
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ != nullptr) {
    DBImpl::DeletionState deletion_state;
    // Unref/delete and the obsolete-file scan require the DB mutex.
    mutex_->Lock();
    if (cfd_->Unref()) {
      delete cfd_;
    }
    db_->FindObsoleteFiles(deletion_state, false, true);
    mutex_->Unlock();
    // Actual file deletion does I/O, so it happens outside the mutex.
    if (deletion_state.HaveSomethingToDelete()) {
      db_->PurgeObsoleteFiles(deletion_state);
    }
  }
}
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
namespace {
// Fix user-supplied options to be reasonable.
// Clamps *ptr into the inclusive range [minvalue, maxvalue], comparing in
// the domain of V to avoid surprises when T and V differ in width/signedness.
// (`static` removed: functions in an anonymous namespace already have
// internal linkage.)
template <class T, class V>
void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
}
}  // anonymous namespace
// Returns a copy of `src` with user-supplied values adjusted to safe,
// mutually consistent settings, and with the comparator/filter policy
// replaced by their internal-key wrappers.
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
                                    const InternalFilterPolicy* ipolicy,
                                    const ColumnFamilyOptions& src) {
  ColumnFamilyOptions result = src;
  result.comparator = icmp;
  // Only wrap the filter policy if the user actually supplied one.
  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
  ClipToRange(&result.write_buffer_size,
              ((size_t)64) << 10, ((size_t)64) << 30);
  // if user sets arena_block_size, we trust user to use this value. Otherwise,
  // calculate a proper value from writer_buffer_size;
  if (result.arena_block_size <= 0) {
    result.arena_block_size = result.write_buffer_size / 10;
  }
  // Can never merge more memtables than exist before the flush limit.
  result.min_write_buffer_number_to_merge =
      std::min(result.min_write_buffer_number_to_merge,
               result.max_write_buffer_number - 1);
  if (result.block_cache == nullptr && !result.no_block_cache) {
    // Default block cache: 8 MB LRU.
    result.block_cache = NewLRUCache(8 << 20);
  }
  result.compression_per_level = src.compression_per_level;
  // block_size_deviation is a percentage; out-of-range values disable it.
  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
    result.block_size_deviation = 0;
  }
  if (result.max_mem_compaction_level >= result.num_levels) {
    result.max_mem_compaction_level = result.num_levels - 1;
  }
  if (result.soft_rate_limit > result.hard_rate_limit) {
    result.soft_rate_limit = result.hard_rate_limit;
  }
  // Hash-based memtables require a prefix extractor; without one, fall back
  // to the skip-list memtable.
  if (!result.prefix_extractor) {
    assert(result.memtable_factory);
    Slice name = result.memtable_factory->Name();
    if (name.compare("HashSkipListRepFactory") == 0 ||
        name.compare("HashLinkListRepFactory") == 0) {
      result.memtable_factory = std::make_shared<SkipListFactory>();
    }
  }

  // -- Sanitize the table properties collector
  // All user defined properties collectors will be wrapped by
  // UserKeyTablePropertiesCollector since for them they only have the
  // knowledge of the user keys; internal keys are invisible to them.
  auto& collectors = result.table_properties_collectors;
  for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) {
    assert(collectors[i]);
    collectors[i] =
        std::make_shared<UserKeyTablePropertiesCollector>(collectors[i]);
  }
  // Add collector to collect internal key statistics
  collectors.push_back(std::make_shared<InternalKeyPropertiesCollector>());

  return result;
}
// Sentinel values stored in the per-thread SuperVersion cache slots:
// kSVInUse marks a slot whose cached SuperVersion a thread is currently
// using (see ResetThreadLocalSuperVersions, which skips such slots);
// kSVObsolete (nullptr) marks an empty/invalidated slot. `dummy` exists
// only to give kSVInUse a unique, never-dereferenced address.
int SuperVersion::dummy = 0;
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
void* const SuperVersion::kSVObsolete = nullptr;
// Frees any memtables whose final reference was dropped while this
// SuperVersion was alive (queued by Cleanup()).
SuperVersion::~SuperVersion() {
  for (auto* dead_memtable : to_delete) {
    delete dead_memtable;
  }
}
// Bumps the reference count; returns `this` so callers can chain.
SuperVersion* SuperVersion::Ref() {
  refs.fetch_add(1, std::memory_order_relaxed);
  return this;
}
// Drops one reference. Returns true iff this call removed the last
// reference, i.e. the caller is now responsible for Cleanup()/delete.
bool SuperVersion::Unref() {
  // fetch_sub returns the count *before* the decrement.
  const uint32_t before = refs.fetch_sub(1, std::memory_order_relaxed);
  assert(before > 0);
  return before == 1;
}
// Releases the references this SuperVersion holds on its memtable,
// immutable memtable list, and current Version. Must only be called once
// the refcount has reached zero. Memtables whose last reference was
// dropped here are queued in to_delete and freed by ~SuperVersion, so the
// actual deletion can happen outside the DB mutex.
void SuperVersion::Cleanup() {
  assert(refs.load(std::memory_order_relaxed) == 0);
  imm->Unref(&to_delete);
  MemTable* m = mem->Unref();
  if (m != nullptr) {
    // We held the last reference on the active memtable.
    to_delete.push_back(m);
  }
  current->Unref();
}
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
Version* new_current) {
mem = new_mem;
imm = new_imm;
current = new_current;
mem->Ref();
imm->Ref();
current->Ref();
refs.store(1, std::memory_order_relaxed);
}
namespace {
// Destructor hook for the per-thread cached SuperVersion pointer.
// UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
// destroyed. When the former happens, the thread shouldn't see kSVInUse.
// When the latter happens, we are in ~ColumnFamilyData(), no get should
// happen as well.
void SuperVersionUnrefHandle(void* ptr) {
  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
  if (sv->Unref()) {
    // Last reference dropped here: Cleanup() must run under the DB mutex.
    sv->db_mutex->Lock();
    sv->Cleanup();
    sv->db_mutex->Unlock();
    delete sv;
  }
}
}  // anonymous namespace
// Constructs the in-memory state for one column family. A nullptr
// dummy_versions marks the special dummy CFD that heads ColumnFamilySet's
// intrusive linked list; for that one, no stats/table cache/compaction
// picker are created.
ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
                                   const std::string& name,
                                   Version* dummy_versions, Cache* table_cache,
                                   const ColumnFamilyOptions& options,
                                   const DBOptions* db_options,
                                   const EnvOptions& storage_options,
                                   ColumnFamilySet* column_family_set)
    : id_(id),
      name_(name),
      dummy_versions_(dummy_versions),
      current_(nullptr),
      refs_(0),
      dropped_(false),
      internal_comparator_(options.comparator),
      internal_filter_policy_(options.filter_policy),
      // NOTE: options_ is built from the internal wrappers initialized just
      // above, so the member order here matters.
      options_(*db_options, SanitizeOptions(&internal_comparator_,
                                            &internal_filter_policy_,
                                            options)),
      mem_(nullptr),
      imm_(options.min_write_buffer_number_to_merge),
      super_version_(nullptr),
      super_version_number_(0),
      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
      need_slowdown_for_num_level0_files_(false),
      column_family_set_(column_family_set) {
  // The creator holds the first reference.
  Ref();

  // if dummy_versions is nullptr, then this is a dummy column family.
  if (dummy_versions != nullptr) {
    internal_stats_.reset(new InternalStats(options.num_levels, db_options->env,
                                            db_options->statistics.get()));
    table_cache_.reset(
        new TableCache(dbname, &options_, storage_options, table_cache));
    // Pick the compaction strategy from the sanitized options.
    if (options_.compaction_style == kCompactionStyleUniversal) {
      compaction_picker_.reset(
          new UniversalCompactionPicker(&options_, &internal_comparator_));
    } else {
      compaction_picker_.reset(
          new LevelCompactionPicker(&options_, &internal_comparator_));
    }

    Log(options_.info_log, "Options for column family \"%s\":\n",
        name.c_str());
    const ColumnFamilyOptions* cf_options = &options_;
    cf_options->Dump(options_.info_log.get());
  }
}
// DB mutex held. Unlinks this column family from all bookkeeping
// structures and releases every resource it owns. Only legal once the
// last reference has been dropped.
ColumnFamilyData::~ColumnFamilyData() {
  assert(refs_ == 0);
  // remove from linked list
  auto prev = prev_;
  auto next = next_;
  prev->next_ = next;
  next->prev_ = prev;

  // it's nullptr for dummy CFD
  if (column_family_set_ != nullptr) {
    // remove from column_family_set
    column_family_set_->RemoveColumnFamily(this);
  }

  if (current_ != nullptr) {
    current_->Unref();
  }

  if (super_version_ != nullptr) {
    // Release SuperVersion reference kept in ThreadLocalPtr.
    // This must be done outside of mutex_ since unref handler can lock
    // mutex. (SuperVersionUnrefHandle acquires db_mutex, so we temporarily
    // drop it around the ThreadLocalPtr teardown.)
    super_version_->db_mutex->Unlock();
    local_sv_.reset();
    super_version_->db_mutex->Lock();

    bool is_last_reference __attribute__((unused));
    is_last_reference = super_version_->Unref();
    // After the thread-local caches are gone, ours must be the last ref.
    assert(is_last_reference);
    super_version_->Cleanup();
    delete super_version_;
    super_version_ = nullptr;
  }

  if (dummy_versions_ != nullptr) {
    // List must be empty
    assert(dummy_versions_->next_ == dummy_versions_);
    delete dummy_versions_;
  }

  if (mem_ != nullptr) {
    // Unref() returns the memtable if we held the last reference.
    delete mem_->Unref();
  }
  autovector<MemTable*> to_delete;
  imm_.current()->Unref(&to_delete);
  for (MemTable* m : to_delete) {
    delete m;
  }
}
// Installs `current` as the live Version and recomputes whether writes
// should be slowed down because of too many level-0 files.
void ColumnFamilyData::SetCurrent(Version* current) {
  current_ = current;
  const int slowdown_trigger = options_.level0_slowdown_writes_trigger;
  need_slowdown_for_num_level0_files_ =
      slowdown_trigger >= 0 &&
      current_->NumLevelFiles(0) >= slowdown_trigger;
}
// Replaces the active memtable with a freshly allocated one, releasing
// this CFD's reference to the previous memtable.
void ColumnFamilyData::CreateNewMemtable() {
  assert(current_ != nullptr);
  if (mem_ != nullptr) {
    // Unref() returns the memtable when the last reference is dropped;
    // deleting nullptr otherwise is a no-op.
    delete mem_->Unref();
  }
  mem_ = new MemTable(internal_comparator_, options_);
  mem_->Ref();
}
// Asks the configured compaction picker to choose a compaction against
// the current version, logging its reasoning into `log_buffer`.
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
  return compaction_picker_->PickCompaction(current_, log_buffer);
}
// Forwards a manual range compaction request for [begin, end] on
// `input_level` -> `output_level` to the compaction picker; the picker may
// narrow the range and reports where it stopped via *compaction_end.
Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
                                           const InternalKey* begin,
                                           const InternalKey* end,
                                           InternalKey** compaction_end) {
  return compaction_picker_->CompactRange(current_, input_level, output_level,
                                          begin, end, compaction_end);
}
// Publishes `new_superversion` (initialized from the current memtable,
// immutable list, and version) as this CFD's super version, bumping the
// version number. Returns the old super version if this call dropped its
// last reference — the caller must delete it outside the DB mutex —
// otherwise returns nullptr.
SuperVersion* ColumnFamilyData::InstallSuperVersion(
    SuperVersion* new_superversion, port::Mutex* db_mutex) {
  new_superversion->db_mutex = db_mutex;
  new_superversion->Init(mem_, imm_.current(), current_);
  SuperVersion* old_superversion = super_version_;
  super_version_ = new_superversion;
  ++super_version_number_;
  super_version_->version_number = super_version_number_;
  if (old_superversion != nullptr && old_superversion->Unref()) {
    old_superversion->Cleanup();
    return old_superversion;  // will let caller delete outside of mutex
  }
  return nullptr;
}
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
autovector<void*> sv_ptrs;
local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
for (auto ptr : sv_ptrs) {
assert(ptr);
if (ptr == SuperVersion::kSVInUse) {
continue;
}
auto sv = static_cast<SuperVersion*>(ptr);
if (sv->Unref()) {
sv->Cleanup();
delete sv;
}
}
}
// Builds the container that tracks all column families of a DB. The dummy
// CFD heads the circular intrusive linked list of column families.
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                 const DBOptions* db_options,
                                 const EnvOptions& storage_options,
                                 Cache* table_cache)
    : max_column_family_(0),
      // FIX: pass the constructor *parameter* `storage_options` here, not
      // the member `storage_options_`. Members initialize in declaration
      // order, so the member may not be initialized yet when the dummy
      // ColumnFamilyData is constructed.
      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
                                      ColumnFamilyOptions(), db_options,
                                      storage_options, nullptr)),
      default_cfd_cache_(nullptr),
      db_name_(dbname),
      db_options_(db_options),
      storage_options_(storage_options),
      table_cache_(table_cache),
      spin_lock_(ATOMIC_FLAG_INIT) {
  // initialize linked list: an empty circular list points at its head.
  dummy_cfd_->prev_ = dummy_cfd_;
  dummy_cfd_->next_ = dummy_cfd_;
}
// Tears down every remaining column family and then the dummy list head.
ColumnFamilySet::~ColumnFamilySet() {
  // Each ColumnFamilyData erases itself from column_family_data_ in its
  // destructor, so keep deleting the first entry until the map drains.
  while (!column_family_data_.empty()) {
    ColumnFamilyData* cfd = column_family_data_.begin()->second;
    cfd->Unref();
    delete cfd;
  }
  dummy_cfd_->Unref();
  delete dummy_cfd_;
}
// Returns the cached default (ID 0) column family; it must already exist.
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
  assert(default_cfd_cache_ != nullptr);
  return default_cfd_cache_;
}
// Looks up a column family by numeric ID; nullptr when unknown.
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
  const auto it = column_family_data_.find(id);
  return (it == column_family_data_.end()) ? nullptr : it->second;
}
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
    const {
  auto name_it = column_families_.find(name);
  if (name_it == column_families_.end()) {
    return nullptr;
  }
  // The name->ID and ID->data maps are mutated together under the lock, so a
  // name hit must also be an ID hit.
  ColumnFamilyData* cfd = GetColumnFamily(name_it->second);
  assert(cfd != nullptr);
  return cfd;
}
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
  // IDs are never reused: hand out one past the largest ID ever seen.
  max_column_family_ += 1;
  return max_column_family_;
}
uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
  // Only ever moves forward; a smaller value is silently ignored.
  if (new_max_column_family > max_column_family_) {
    max_column_family_ = new_max_column_family;
  }
}
// under a DB mutex
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
    const std::string& name, uint32_t id, Version* dummy_versions,
    const ColumnFamilyOptions& options) {
  assert(column_families_.find(name) == column_families_.end());

  auto* new_cfd =
      new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
                           options, db_options_, storage_options_, this);

  // The lookup maps may also be read outside of the DB mutex (see Seek()),
  // so guard the mutations with the spin lock.
  Lock();
  column_families_.insert({name, id});
  column_family_data_.insert({id, new_cfd});
  Unlock();

  if (id > max_column_family_) {
    max_column_family_ = id;
  }

  // Splice the new node in just before the dummy head, i.e. at the tail of
  // the circular doubly-linked list.
  auto* tail = dummy_cfd_->prev_;
  new_cfd->next_ = dummy_cfd_;
  new_cfd->prev_ = tail;
  tail->next_ = new_cfd;
  dummy_cfd_->prev_ = new_cfd;

  if (id == 0) {
    // keep the fast-path cache for the default column family up to date
    default_cfd_cache_ = new_cfd;
  }
  return new_cfd;
}
// Acquires the spin lock protecting column_families_/column_family_data_.
// Busy-waits; hold times are expected to be tiny (map insert/erase/lookup).
void ColumnFamilySet::Lock() {
  // spin lock
  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
  }
}
void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
autovector<ColumnFamilyData*> to_delete;
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
if (cfd->refs_ == 0) {
to_delete.push_back(cfd);
}
}
for (auto cfd : to_delete) {
// this is very rare, so it's not a problem that we do it under a mutex
delete cfd;
}
}
// under a DB mutex
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
  auto data_it = column_family_data_.find(cfd->GetID());
  assert(data_it != column_family_data_.end());
  // Readers may consult the maps while holding only the spin lock, so the
  // erase must happen under it as well.
  Lock();
  column_family_data_.erase(data_it);
  column_families_.erase(cfd->GetName());
  Unlock();
}
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
  if (column_family_id != 0) {
    // maybe outside of db mutex -- take the spin lock while consulting the
    // column family map
    column_family_set_->Lock();
    current_ = column_family_set_->GetColumnFamily(column_family_id);
    column_family_set_->Unlock();
  } else {
    // fast path: the default column family is cached and thread-safe to read
    current_ = column_family_set_->GetDefault();
  }
  // current_ may be nullptr here; the handle still records it
  handle_.SetCFD(current_);
  return current_ != nullptr;
}
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
  // REQUIRES: a successful Seek() call has selected a column family
  auto* cfd = current_;
  assert(cfd != nullptr);
  return cfd->GetLogNumber();
}
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
  // REQUIRES: a successful Seek() call has selected a column family
  auto* cfd = current_;
  assert(cfd != nullptr);
  return cfd->mem();
}
const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
  // REQUIRES: a successful Seek() call has selected a column family
  auto* cfd = current_;
  assert(cfd != nullptr);
  return cfd->options();
}
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
  // REQUIRES: a successful Seek() call. Returns the internal (non-owning)
  // handle, which Seek() pointed at current_.
  assert(current_ != nullptr);
  return &handle_;
}
} // namespace rocksdb

@ -0,0 +1,408 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <unordered_map>
#include <string>
#include <vector>
#include <atomic>
#include "rocksdb/options.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "db/memtable_list.h"
#include "db/write_batch_internal.h"
#include "db/table_cache.h"
#include "util/thread_local.h"
namespace rocksdb {
class Version;
class VersionSet;
class MemTable;
class MemTableListVersion;
class CompactionPicker;
class Compaction;
class InternalKey;
class InternalStats;
class ColumnFamilyData;
class DBImpl;
class LogBuffer;
// ColumnFamilyHandleImpl is the class that clients use to access different
// column families. It has non-trivial destructor, which gets called when client
// is done using the column family
class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
 public:
  // create while holding the mutex
  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
  // destroy without mutex
  virtual ~ColumnFamilyHandleImpl();
  // Underlying column family metadata (non-owning accessor).
  virtual ColumnFamilyData* cfd() const { return cfd_; }
  // ID of the column family this handle refers to.
  virtual uint32_t GetID() const;

 private:
  ColumnFamilyData* cfd_;  // column family this handle refers to
  DBImpl* db_;  // owning DB; NOTE(review): presumably used by the destructor
                // to release the reference -- confirm in column_family.cc
  port::Mutex* mutex_;  // DB mutex (ctor is called with it held, see above)
};
// Does not ref-count ColumnFamilyData
// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
// calls DBImpl methods. When this happens, MemTableInserter need access to
// ColumnFamilyHandle (same as the client would need). In that case, we feed
// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
// methods
class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
 public:
  // Base is constructed with all-null members; this handle never owns
  // anything and has no destructor side effects.
  ColumnFamilyHandleInternal()
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}

  // Repoints the handle; caller keeps ownership of cfd.
  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }

 private:
  ColumnFamilyData* internal_cfd_;  // shadowed target, bypasses the base's cfd_
};
// holds references to memtable, all immutable memtables and version
struct SuperVersion {
  MemTable* mem;             // current (mutable) memtable
  MemTableListVersion* imm;  // snapshot of the immutable memtable list
  Version* current;          // current LSM version
  std::atomic<uint32_t> refs;  // reference count; see Ref()/Unref()
  // We need to_delete because during Cleanup(), imm->Unref() returns
  // all memtables that we need to free through this vector. We then
  // delete all those memtables outside of mutex, during destruction
  autovector<MemTable*> to_delete;
  // Version number of the current SuperVersion
  uint64_t version_number;
  port::Mutex* db_mutex;  // the DB mutex guarding this SuperVersion

  // should be called outside the mutex
  SuperVersion() = default;
  ~SuperVersion();
  SuperVersion* Ref();
  // Returns true when the last reference was dropped; the caller then runs
  // Cleanup() and deletes the object (see InstallSuperVersion in the .cc).
  bool Unref();

  // call these two methods with db mutex held
  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
  // that needs to be deleted in to_delete vector. Unrefing those
  // objects needs to be done in the mutex
  void Cleanup();
  void Init(MemTable* new_mem, MemTableListVersion* new_imm,
            Version* new_current);

  // The value of dummy is not actually used. kSVInUse takes its address as a
  // mark in the thread local storage to indicate the SuperVersion is in use
  // by thread. This way, the value of kSVInUse is guaranteed to have no
  // conflict with SuperVersion object address and portable on different
  // platform.
  static int dummy;
  static void* const kSVInUse;
  static void* const kSVObsolete;
};
// Returns a copy of `src` adjusted for internal use; presumably clamps or
// corrects out-of-range option values -- see definition in column_family.cc.
extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
                                           const InternalFilterPolicy* ipolicy,
                                           const ColumnFamilyOptions& src);
class ColumnFamilySet;
// This class keeps all the data that a column family needs. It's mostly dumb
// and used just to provide access to metadata.
// Most methods require DB mutex held, unless otherwise noted
class ColumnFamilyData {
 public:
  ~ColumnFamilyData();

  // thread-safe
  uint32_t GetID() const { return id_; }
  // thread-safe
  const std::string& GetName() const { return name_; }

  // Increments the reference count (DB mutex held -- refs_ is a plain int).
  void Ref() { ++refs_; }
  // will just decrease reference count to 0, but will not delete it. returns
  // true if the ref count was decreased to zero. in that case, it can be
  // deleted by the caller immediatelly, or later, by calling
  // FreeDeadColumnFamilies()
  bool Unref() {
    assert(refs_ > 0);
    return --refs_ == 0;
  }

  // This can only be called from single-threaded VersionSet::LogAndApply()
  // After dropping column family no other operation on that column family
  // will be executed. All the files and memory will be, however, kept around
  // until client drops the column family handle. That way, client can still
  // access data from dropped column family.
  // Column family can be dropped and still alive. In that state:
  // *) Column family is not included in the iteration.
  // *) Compaction and flush is not executed on the dropped column family.
  // *) Client can continue writing and reading from column family. However, all
  // writes stay in the current memtable.
  // When the dropped column family is unreferenced, then we:
  // *) delete all memory associated with that column family
  // *) delete all the files associated with that column family
  void SetDropped() {
    // can't drop default CF
    assert(id_ != 0);
    dropped_ = true;
  }
  bool IsDropped() const { return dropped_; }

  // thread-safe
  int NumberLevels() const { return options_.num_levels; }

  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
  uint64_t GetLogNumber() const { return log_number_; }

  // thread-safe
  const Options* options() const { return &options_; }

  InternalStats* internal_stats() { return internal_stats_.get(); }

  // Accessors for the column family's mutable state (DB mutex conventions of
  // the class apply unless marked thread-safe).
  MemTableList* imm() { return &imm_; }
  MemTable* mem() { return mem_; }
  Version* current() { return current_; }
  Version* dummy_versions() { return dummy_versions_; }
  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
  void SetCurrent(Version* current);
  void CreateNewMemtable();

  TableCache* table_cache() { return table_cache_.get(); }

  // See documentation in compaction_picker.h
  Compaction* PickCompaction(LogBuffer* log_buffer);
  Compaction* CompactRange(int input_level, int output_level,
                           const InternalKey* begin, const InternalKey* end,
                           InternalKey** compaction_end);

  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
  // thread-safe
  const Comparator* user_comparator() const {
    return internal_comparator_.user_comparator();
  }
  // thread-safe
  const InternalKeyComparator& internal_comparator() const {
    return internal_comparator_;
  }

  SuperVersion* GetSuperVersion() { return super_version_; }
  // thread-safe
  ThreadLocalPtr* GetThreadLocalSuperVersion() const { return local_sv_.get(); }
  // thread-safe
  uint64_t GetSuperVersionNumber() const {
    return super_version_number_.load();
  }
  // will return a pointer to SuperVersion* if previous SuperVersion
  // if its reference count is zero and needs deletion or nullptr if not
  // As argument takes a pointer to allocated SuperVersion to enable
  // the clients to allocate SuperVersion outside of mutex.
  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
                                    port::Mutex* db_mutex);

  // Marks all thread-local cached SuperVersions obsolete and unrefs them
  // (unless a thread has one marked in-use).
  void ResetThreadLocalSuperVersions();

  // A Flag indicating whether write needs to slowdown because of there are
  // too many number of level0 files.
  bool NeedSlowdownForNumLevel0Files() const {
    return need_slowdown_for_num_level0_files_;
  }

 private:
  friend class ColumnFamilySet;
  // Construction goes through ColumnFamilySet::CreateColumnFamily().
  ColumnFamilyData(const std::string& dbname, uint32_t id,
                   const std::string& name, Version* dummy_versions,
                   Cache* table_cache, const ColumnFamilyOptions& options,
                   const DBOptions* db_options,
                   const EnvOptions& storage_options,
                   ColumnFamilySet* column_family_set);

  uint32_t id_;
  const std::string name_;
  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
  Version* current_;         // == dummy_versions->prev_

  int refs_;      // outstanding references to ColumnFamilyData
  bool dropped_;  // true if client dropped it

  const InternalKeyComparator internal_comparator_;
  const InternalFilterPolicy internal_filter_policy_;

  Options const options_;

  std::unique_ptr<TableCache> table_cache_;

  std::unique_ptr<InternalStats> internal_stats_;

  MemTable* mem_;
  MemTableList imm_;
  SuperVersion* super_version_;

  // An ordinal representing the current SuperVersion. Updated by
  // InstallSuperVersion(), i.e. incremented every time super_version_
  // changes.
  std::atomic<uint64_t> super_version_number_;

  // Thread's local copy of SuperVersion pointer
  // This needs to be destructed before mutex_
  std::unique_ptr<ThreadLocalPtr> local_sv_;

  // pointers for a circular linked list. we use it to support iterations
  // that can be concurrent with writes
  ColumnFamilyData* next_;
  ColumnFamilyData* prev_;

  // This is the earliest log file number that contains data from this
  // Column Family. All earlier log files must be ignored and not
  // recovered from
  uint64_t log_number_;

  // A flag indicating whether we should delay writes because
  // we have too many level 0 files
  bool need_slowdown_for_num_level0_files_;

  // An object that keeps all the compaction stats
  // and picks the next compaction
  std::unique_ptr<CompactionPicker> compaction_picker_;

  ColumnFamilySet* column_family_set_;  // back-pointer; set by CreateColumnFamily
};
// ColumnFamilySet has interesting thread-safety requirements
// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
// mutex. Inside, column_family_data_ and column_families_ will be protected
// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
// VersionSet::LogAndApply() in the normal runtime. It is also called
// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
// from ColumnFamilyData destructor
// * Iteration -- hold DB mutex, but you can release it in the body of
// iteration. If you release DB mutex in body, reference the column
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily() --
// inside of DB mutex
class ColumnFamilySet {
 public:
  // ColumnFamilySet supports iteration
  class iterator {
   public:
    explicit iterator(ColumnFamilyData* cfd)
        : current_(cfd) {}
    iterator& operator++() {
      // dummy is never dead or dropped, so this will never be infinite
      do {
        current_ = current_->next_;
      } while (current_->refs_ == 0 || current_->IsDropped());
      return *this;
    }
    bool operator!=(const iterator& other) {
      return this->current_ != other.current_;
    }
    ColumnFamilyData* operator*() { return current_; }

   private:
    ColumnFamilyData* current_;  // node in the circular linked list
  };

  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
                  const EnvOptions& storage_options, Cache* table_cache);
  ~ColumnFamilySet();

  ColumnFamilyData* GetDefault() const;
  // GetColumnFamily() calls return nullptr if column family is not found
  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
  // this call will return the next available column family ID. it guarantees
  // that there is no column family with id greater than or equal to the
  // returned value in the current running instance or anytime in RocksDB
  // instance history.
  uint32_t GetNextColumnFamilyID();
  uint32_t GetMaxColumnFamily();
  void UpdateMaxColumnFamily(uint32_t new_max_column_family);

  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
                                       Version* dummy_version,
                                       const ColumnFamilyOptions& options);

  // Iteration starts past the dummy head and ends back at it.
  iterator begin() { return iterator(dummy_cfd_->next_); }
  iterator end() { return iterator(dummy_cfd_); }

  // Spin lock protecting the two lookup maps below.
  void Lock();
  void Unlock();

  // REQUIRES: DB mutex held
  // Don't call while iterating over ColumnFamilySet
  void FreeDeadColumnFamilies();

 private:
  friend class ColumnFamilyData;
  // helper function that gets called from cfd destructor
  // REQUIRES: DB mutex held
  void RemoveColumnFamily(ColumnFamilyData* cfd);

  // column_families_ and column_family_data_ need to be protected:
  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
  // (if both, respect the ordering to avoid deadlock!)
  std::unordered_map<std::string, uint32_t> column_families_;
  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;

  uint32_t max_column_family_;  // largest ID ever allocated
  ColumnFamilyData* dummy_cfd_;  // head of the circular linked list
  // We don't hold the refcount here, since default column family always exists
  // We are also not responsible for cleaning up default_cfd_cache_. This is
  // just a cache that makes common case (accessing default column family)
  // faster
  ColumnFamilyData* default_cfd_cache_;

  const std::string db_name_;
  const DBOptions* const db_options_;
  const EnvOptions storage_options_;
  Cache* table_cache_;
  std::atomic_flag spin_lock_;  // see Lock()/Unlock()
};
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
// memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
 public:
  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
      : column_family_set_(column_family_set), current_(nullptr) {}

  // sets current_ to ColumnFamilyData with column_family_id
  // returns false if column family doesn't exist
  bool Seek(uint32_t column_family_id) override;

  // Returns log number of the selected column family
  uint64_t GetLogNumber() const override;

  // REQUIRES: Seek() called first
  virtual MemTable* GetMemTable() const override;

  // Returns options for selected column family
  // REQUIRES: Seek() called first
  virtual const Options* GetOptions() const override;

  // Returns column family handle for the selected column family
  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;

 private:
  ColumnFamilySet* column_family_set_;  // not owned
  ColumnFamilyData* current_;           // set by Seek(); may be nullptr
  ColumnFamilyHandleInternal handle_;   // non-owning handle over current_
};
} // namespace rocksdb

@ -0,0 +1,857 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <vector>
#include <string>
#include "db/db_impl.h"
#include "rocksdb/env.h"
#include "rocksdb/db.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "util/coding.h"
#include "utilities/merge_operators.h"
namespace rocksdb {
namespace {
// Produces a random string of exactly `len` bytes using the shared test
// helper.
std::string RandomString(Random* rnd, int len) {
  std::string result;
  test::RandomString(rnd, len, &result);
  return result;
}
} // anonymous namespace
// Test harness: owns a DB under a temp directory plus one handle per column
// family, and provides helpers for open/close, CRUD, flushing, compaction
// and file/log inspection. Column families are addressed by index into
// handles_/names_.
class ColumnFamilyTest {
 public:
  ColumnFamilyTest() : rnd_(139) {
    env_ = Env::Default();
    dbname_ = test::TmpDir() + "/column_family_test";
    db_options_.create_if_missing = true;
    // start every test from a clean slate
    DestroyDB(dbname_, Options(db_options_, column_family_options_));
  }

  // Deletes all handles and closes the DB; on-disk state is preserved.
  void Close() {
    for (auto h : handles_) {
      delete h;
    }
    handles_.clear();
    names_.clear();
    delete db_;
    db_ = nullptr;
  }

  // Opens the DB with the given column families. If `options` is empty,
  // column_family_options_ is used for every family; otherwise options[i]
  // pairs with cf[i].
  Status TryOpen(std::vector<std::string> cf,
                 std::vector<ColumnFamilyOptions> options = {}) {
    std::vector<ColumnFamilyDescriptor> column_families;
    names_.clear();
    for (size_t i = 0; i < cf.size(); ++i) {
      column_families.push_back(ColumnFamilyDescriptor(
          cf[i], options.size() == 0 ? column_family_options_ : options[i]));
      names_.push_back(cf[i]);
    }
    return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
  }

  // Like TryOpen, but the open must succeed.
  void Open(std::vector<std::string> cf,
            std::vector<ColumnFamilyOptions> options = {}) {
    ASSERT_OK(TryOpen(cf, options));
  }

  // Opens only the default column family.
  void Open() {
    Open({"default"});
  }

  DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }

  // Reads an integer-valued DB property for column family `cf`.
  int GetProperty(int cf, std::string property) {
    std::string value;
    ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
    return std::stoi(value);
  }

  // Closes the DB and removes all of its files from disk.
  void Destroy() {
    for (auto h : handles_) {
      delete h;
    }
    handles_.clear();
    names_.clear();
    delete db_;
    db_ = nullptr;
    ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
  }

  // Creates the given column families on the live DB, appending their
  // handles/names to handles_/names_.
  void CreateColumnFamilies(
      const std::vector<std::string>& cfs,
      const std::vector<ColumnFamilyOptions> options = {}) {
    int cfi = handles_.size();
    handles_.resize(cfi + cfs.size());
    names_.resize(cfi + cfs.size());
    for (size_t i = 0; i < cfs.size(); ++i) {
      ASSERT_OK(db_->CreateColumnFamily(
          options.size() == 0 ? column_family_options_ : options[i], cfs[i],
          &handles_[cfi]));
      names_[cfi] = cfs[i];
      cfi++;
    }
  }

  // Closes and reopens the DB with all still-existing (non-dropped) column
  // families. Dropped families have an empty name and are skipped.
  void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
    std::vector<std::string> names;
    for (auto name : names_) {
      if (name != "") {
        names.push_back(name);
      }
    }
    Close();
    assert(options.size() == 0 || names.size() == options.size());
    Open(names, options);
  }

  void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
    CreateColumnFamilies(cfs);
    Reopen();
  }

  // Drops the families at the given indices; their slots remain in
  // handles_/names_ (null/empty) so other indices stay stable.
  void DropColumnFamilies(const std::vector<int>& cfs) {
    for (auto cf : cfs) {
      ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
      delete handles_[cf];
      handles_[cf] = nullptr;
      names_[cf] = "";
    }
  }

  // Writes `num` random entries of roughly key_value_size bytes each.
  void PutRandomData(int cf, int num, int key_value_size) {
    for (int i = 0; i < num; ++i) {
      // 10 bytes for key, rest is value
      ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
                    RandomString(&rnd_, key_value_size - 10)));
    }
  }

  void WaitForFlush(int cf) {
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
  }

  void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }

  Status Put(int cf, const std::string& key, const std::string& value) {
    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
  }
  Status Merge(int cf, const std::string& key, const std::string& value) {
    return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
  }
  Status Flush(int cf) {
    return db_->Flush(FlushOptions(), handles_[cf]);
  }

  // Reads a key; returns "NOT_FOUND" or the error string on failure.
  std::string Get(int cf, const std::string& key) {
    ReadOptions options;
    options.verify_checksums = true;
    std::string result;
    Status s = db_->Get(options, handles_[cf], Slice(key), &result);
    if (s.IsNotFound()) {
      result = "NOT_FOUND";
    } else if (!s.ok()) {
      result = s.ToString();
    }
    return result;
  }

  void CompactAll(int cf) {
    ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
  }

  void Compact(int cf, const Slice& start, const Slice& limit) {
    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
  }

  int NumTableFilesAtLevel(int level, int cf) {
    return GetProperty(cf,
                       "rocksdb.num-files-at-level" + std::to_string(level));
  }

  // Return spread of files per level
  std::string FilesPerLevel(int cf) {
    std::string result;
    int last_non_zero_offset = 0;
    for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
      int f = NumTableFilesAtLevel(level, cf);
      char buf[100];
      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
      result += buf;
      if (f > 0) {
        last_non_zero_offset = result.size();
      }
    }
    // trim trailing zero-count levels
    result.resize(last_non_zero_offset);
    return result;
  }

  // NOTE(review): counts all live SST files in the DB, not only cf's --
  // the parameter is unused by GetLiveFilesMetaData.
  int CountLiveFiles(int cf) {
    std::vector<LiveFileMetaData> metadata;
    db_->GetLiveFilesMetaData(&metadata);
    return static_cast<int>(metadata.size());
  }

  // Do n memtable flushes, each of which produces an sstable
  // covering the range [small,large].
  void MakeTables(int cf, int n, const std::string& small,
                  const std::string& large) {
    for (int i = 0; i < n; i++) {
      ASSERT_OK(Put(cf, small, "begin"));
      ASSERT_OK(Put(cf, large, "end"));
      ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
    }
  }

  int CountLiveLogFiles() {
    // give the background log-deletion a chance to run first
    int micros_wait_for_log_deletion = 20000;
    env_->SleepForMicroseconds(micros_wait_for_log_deletion);
    int ret = 0;
    VectorLogPtr wal_files;
    Status s;
    // GetSortedWalFiles is a flakey function -- it gets all the wal_dir
    // children files and then later checks for their existance. if some of the
    // log files doesn't exist anymore, it reports an error. it does all of this
    // without DB mutex held, so if a background process deletes the log file
    // while the function is being executed, it returns an error. We retry the
    // function 10 times to avoid the error failing the test
    for (int retries = 0; retries < 10; ++retries) {
      wal_files.clear();
      s = db_->GetSortedWalFiles(wal_files);
      if (s.ok()) {
        break;
      }
    }
    ASSERT_OK(s);
    for (const auto& wal : wal_files) {
      if (wal->Type() == kAliveLogFile) {
        ++ret;
      }
    }
    return ret;
  }

  // Asserts each column family's immutable-memtable count via DB properties.
  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
    assert(num_per_cf.size() == handles_.size());
    for (size_t i = 0; i < num_per_cf.size(); ++i) {
      ASSERT_EQ(num_per_cf[i],
                GetProperty(i, "rocksdb.num-immutable-mem-table"));
    }
  }

  // Copies `size` bytes (default: whole file) from source to destination.
  void CopyFile(const std::string& source, const std::string& destination,
                uint64_t size = 0) {
    const EnvOptions soptions;
    unique_ptr<SequentialFile> srcfile;
    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
    unique_ptr<WritableFile> destfile;
    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));

    if (size == 0) {
      // default argument means copy everything
      ASSERT_OK(env_->GetFileSize(source, &size));
    }

    char buffer[4096];
    Slice slice;
    while (size > 0) {
      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
      ASSERT_OK(srcfile->Read(one, &slice, buffer));
      ASSERT_OK(destfile->Append(slice));
      size -= slice.size();
    }
    ASSERT_OK(destfile->Close());
  }

  std::vector<ColumnFamilyHandle*> handles_;  // one per open column family
  std::vector<std::string> names_;            // parallel to handles_
  ColumnFamilyOptions column_family_options_;
  DBOptions db_options_;
  std::string dbname_;
  DB* db_ = nullptr;
  Env* env_;
  Random rnd_;
};
// Verifies that column family IDs are never recycled: after dropping the
// family with ID 3, a newly created family gets ID 4, across reopens and
// (iter == 2) across a manifest WriteSnapshot.
TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
  for (int iter = 0; iter < 3; ++iter) {
    Open();
    CreateColumnFamilies({"one", "two", "three"});
    // families were created in order, so index == ID here
    for (size_t i = 0; i < handles_.size(); ++i) {
      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
      ASSERT_EQ(i, cfh->GetID());
    }
    if (iter == 1) {
      Reopen();
    }
    DropColumnFamilies({3});
    Reopen();
    if (iter == 2) {
      // this tests if max_column_family is correctly persisted with
      // WriteSnapshot()
      Reopen();
    }
    CreateColumnFamilies({"three2"});
    // ID 3 that was used for dropped column family "three" should not be reused
    auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
    ASSERT_EQ(4, cfh3->GetID());
    Close();
    Destroy();
  }
}
// Basic create/drop lifecycle: reads against dropped families' data fail with
// NOT_FOUND elsewhere, reopening without listing all families is rejected,
// and ListColumnFamilies reflects only the survivors.
TEST(ColumnFamilyTest, AddDrop) {
  Open();
  CreateColumnFamilies({"one", "two", "three"});
  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
  ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
  DropColumnFamilies({2});
  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
  // "four" takes slot 3 (slot 2 stays empty after the drop)
  CreateColumnFamilies({"four"});
  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
  ASSERT_OK(Put(1, "fodor", "mirko"));
  ASSERT_EQ("mirko", Get(1, "fodor"));
  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
  Close();
  // opening with only "default" must fail -- all existing families have to
  // be listed
  ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
  Open({"default", "one", "three", "four"});
  DropColumnFamilies({1});
  Reopen();
  Close();

  std::vector<std::string> families;
  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
  sort(families.begin(), families.end());
  ASSERT_TRUE(families ==
              std::vector<std::string>({"default", "four", "three"}));
}
// Verifies that dropping a column family deletes its SST files, both when
// dropping on a freshly written DB and after a reopen.
TEST(ColumnFamilyTest, DropTest) {
  // first iteration - dont reopen DB before dropping
  // second iteration - reopen DB before dropping
  for (int iter = 0; iter < 2; ++iter) {
    Open({"default"});
    CreateColumnFamiliesAndReopen({"pikachu"});
    for (int i = 0; i < 100; ++i) {
      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
    }
    ASSERT_OK(Flush(1));

    if (iter == 1) {
      Reopen();
    }
    ASSERT_EQ("bar1", Get(1, "1"));

    ASSERT_EQ(CountLiveFiles(1), 1);
    DropColumnFamilies({1});
    // make sure that all files are deleted when we drop the column family
    ASSERT_EQ(CountLiveFiles(1), 0);
    Destroy();
  }
}
// A write batch targeting a since-dropped column family must fail with
// InvalidArgument, while the same batch succeeds before the drop.
TEST(ColumnFamilyTest, WriteBatchFailure) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  WriteBatch batch;
  batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
  DropColumnFamilies({1});
  // replaying the same batch after the drop must be rejected
  Status s = db_->Write(WriteOptions(), &batch);
  ASSERT_TRUE(s.IsInvalidArgument());
  Close();
}
// Writes to three column families are isolated from each other and survive
// reopens (data still only in memtables/WAL -- no explicit flush).
TEST(ColumnFamilyTest, ReadWrite) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  ASSERT_OK(Put(0, "foo", "v1"));
  ASSERT_OK(Put(0, "bar", "v2"));
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));  // overwrite in default CF
  ASSERT_OK(Put(2, "fodor", "v5"));

  for (int iter = 0; iter <= 3; ++iter) {
    ASSERT_EQ("v2", Get(0, "foo"));
    ASSERT_EQ("v2", Get(0, "bar"));
    ASSERT_EQ("v3", Get(1, "mirko"));
    ASSERT_EQ("v5", Get(2, "fodor"));
    // keys must not leak across column families
    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
    if (iter <= 1) {
      Reopen();
    }
  }
  Close();
}
// With a merge operator (uint64 add), replaying an already-recovered WAL
// would double-apply merges. This test backs up the WAL files, recovers,
// copies them back, recovers again, and checks that values are unchanged --
// i.e. already-recovered log data is ignored on the second recovery.
TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
  std::string backup_logs = dbname_ + "/backup_logs";

  // delete old files in backup_logs directory
  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
  std::vector<std::string> old_files;
  env_->GetChildren(backup_logs, &old_files);  // best-effort cleanup
  for (auto& file : old_files) {
    if (file != "." && file != "..") {
      env_->DeleteFile(backup_logs + "/" + file);
    }
  }

  column_family_options_.merge_operator =
      MergeOperators::CreateUInt64AddOperator();
  db_options_.wal_dir = dbname_ + "/logs";

  Destroy();
  Open();
  CreateColumnFamilies({"cf1", "cf2"});

  // fill up the DB
  std::string one, two, three;
  PutFixed64(&one, 1);
  PutFixed64(&two, 2);
  PutFixed64(&three, 3);
  ASSERT_OK(Merge(0, "foo", one));
  ASSERT_OK(Merge(1, "mirko", one));
  ASSERT_OK(Merge(0, "foo", one));
  ASSERT_OK(Merge(2, "bla", one));
  ASSERT_OK(Merge(2, "fodor", one));
  ASSERT_OK(Merge(0, "bar", one));
  ASSERT_OK(Merge(2, "bla", one));
  ASSERT_OK(Merge(1, "mirko", two));
  ASSERT_OK(Merge(1, "franjo", one));

  // copy the logs to backup
  std::vector<std::string> logs;
  env_->GetChildren(db_options_.wal_dir, &logs);
  for (auto& log : logs) {
    if (log != ".." && log != ".") {
      CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
    }
  }

  // recover the DB
  Close();

  // 1. check consistency
  // 2. copy the logs from backup back to WAL dir. if the recovery happens
  // again on the same log files, this should lead to incorrect results
  // due to applying merge operator twice
  // 3. check consistency
  for (int iter = 0; iter < 2; ++iter) {
    // assert consistency
    Open({"default", "cf1", "cf2"});
    ASSERT_EQ(two, Get(0, "foo"));    // 1 + 1
    ASSERT_EQ(one, Get(0, "bar"));
    ASSERT_EQ(three, Get(1, "mirko"));  // 1 + 2
    ASSERT_EQ(one, Get(1, "franjo"));
    ASSERT_EQ(one, Get(2, "fodor"));
    ASSERT_EQ(two, Get(2, "bla"));    // 1 + 1
    Close();

    if (iter == 0) {
      // copy the logs from backup back to wal dir
      for (auto& log : logs) {
        if (log != ".." && log != ".") {
          CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
        }
      }
    }
  }
}
// Like ReadWrite, but explicitly flushes every column family before the
// first reopen, so the reads after recovery are served from SST files
// rather than from the replayed WAL.
TEST(ColumnFamilyTest, FlushTest) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  ASSERT_OK(Put(0, "foo", "v1"));
  ASSERT_OK(Put(0, "bar", "v2"));
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));  // overwrite in default CF
  ASSERT_OK(Put(2, "fodor", "v5"));
  for (int i = 0; i < 3; ++i) {
    // Fix: the Flush() status was previously dropped on the floor; a failed
    // flush would let the test silently continue against the wrong state.
    ASSERT_OK(Flush(i));
  }
  Reopen();

  for (int iter = 0; iter <= 2; ++iter) {
    ASSERT_EQ("v2", Get(0, "foo"));
    ASSERT_EQ("v2", Get(0, "bar"));
    ASSERT_EQ("v3", Get(1, "mirko"));
    ASSERT_EQ("v5", Get(2, "fodor"));
    // keys must not leak across column families
    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
    if (iter <= 1) {
      Reopen();
    }
  }
  Close();
}
// Makes sure that obsolete log files get deleted
// A WAL file stays alive while ANY column family with unflushed data in it
// exists; it becomes deletable only once every family's data in it has been
// flushed. The bracket comments below track which families have data in each
// log: "[0, (1)]" means the log holds data for CF 0 (unflushed) and CF 1
// (already flushed).
TEST(ColumnFamilyTest, LogDeletionTest) {
  column_family_options_.write_buffer_size = 100000;  // 100KB
  Open();
  CreateColumnFamilies({"one", "two", "three", "four"});
  // Each bracket is one log file. if number is in (), it means
  // we don't need it anymore (it's been flushed)
  // []
  ASSERT_EQ(CountLiveLogFiles(), 0);
  PutRandomData(0, 1, 100);
  // [0]
  PutRandomData(1, 1, 100);
  // [0, 1]
  PutRandomData(1, 1000, 100);  // exceeds CF 1's write buffer -> flush
  WaitForFlush(1);
  // [0, (1)] [1]
  ASSERT_EQ(CountLiveLogFiles(), 2);
  PutRandomData(0, 1, 100);
  // [0, (1)] [0, 1]
  ASSERT_EQ(CountLiveLogFiles(), 2);
  PutRandomData(2, 1, 100);
  // [0, (1)] [0, 1, 2]
  PutRandomData(2, 1000, 100);
  WaitForFlush(2);
  // [0, (1)] [0, 1, (2)] [2]
  ASSERT_EQ(CountLiveLogFiles(), 3);
  PutRandomData(2, 1000, 100);
  WaitForFlush(2);
  // [0, (1)] [0, 1, (2)] [(2)] [2]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  PutRandomData(3, 1, 100);
  // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
  PutRandomData(1, 1, 100);
  // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  PutRandomData(1, 1000, 100);
  WaitForFlush(1);
  // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(0, 1000, 100);
  WaitForFlush(0);
  // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
  // delete obsolete logs -->
  // [(1), 2, 3] [1, (0)] [0]
  ASSERT_EQ(CountLiveLogFiles(), 3);
  PutRandomData(0, 1000, 100);
  WaitForFlush(0);
  // [(1), 2, 3] [1, (0)], [(0)] [0]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  PutRandomData(1, 1000, 100);
  WaitForFlush(1);
  // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(2, 1000, 100);
  WaitForFlush(2);
  // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
  ASSERT_EQ(CountLiveLogFiles(), 6);
  PutRandomData(3, 1000, 100);
  WaitForFlush(3);
  // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
  // delete obsolete logs -->
  // [0, (1)] [1, (2)], [2, (3)] [3]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  Close();
}
// Makes sure that each column family flushes independently according to its
// own write buffer settings, and that logs become obsolete accordingly
TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
  Open();
  CreateColumnFamilies({"one", "two", "three"});
  ColumnFamilyOptions default_cf, one, two, three;
  // Every column family may hold up to 10 write buffers.
  // "default" -> 100KB memtable, starts flushing immediately
  // "one"     -> 200KB memtable, starts flushing with two immutable memtables
  // "two"     -> 1MB memtable, starts flushing with three immutable memtables
  // "three"   -> 90KB memtable, starts flushing with four immutable memtables
  default_cf.write_buffer_size = 100000;
  default_cf.max_write_buffer_number = 10;
  default_cf.min_write_buffer_number_to_merge = 1;
  one.write_buffer_size = 200000;
  one.max_write_buffer_number = 10;
  one.min_write_buffer_number_to_merge = 2;
  two.write_buffer_size = 1000000;
  two.max_write_buffer_number = 10;
  two.min_write_buffer_number_to_merge = 3;
  three.write_buffer_size = 90000;
  three.max_write_buffer_number = 10;
  three.min_write_buffer_number_to_merge = 4;
  Reopen({default_cf, one, two, three});

  // Give any background flush a moment to complete before we inspect the
  // immutable-memtable counts and the live log count.
  const int kFlushWaitMicros = 10000;
  auto let_flush_settle = [&] { env_->SleepForMicroseconds(kFlushWaitMicros); };

  // Fill "default" once; it flushes right away (merge threshold is 1).
  PutRandomData(0, 100, 1000);
  WaitForFlush(0);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 1);
  // "one" accumulates an immutable memtable but does not flush yet.
  PutRandomData(1, 200, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 2);
  // "two" piles up immutable memtables (flush threshold is 3).
  PutRandomData(2, 1000, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 1, 0});
  ASSERT_EQ(CountLiveLogFiles(), 3);
  PutRandomData(2, 1000, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 0});
  ASSERT_EQ(CountLiveLogFiles(), 4);
  // "three" piles up immutable memtables (flush threshold is 4).
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 1});
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 2});
  ASSERT_EQ(CountLiveLogFiles(), 6);
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
  ASSERT_EQ(CountLiveLogFiles(), 7);
  PutRandomData(0, 100, 1000);
  WaitForFlush(0);
  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
  ASSERT_EQ(CountLiveLogFiles(), 8);
  // Push "two" over its threshold: its immutable memtables merge and flush.
  PutRandomData(2, 100, 10000);
  WaitForFlush(2);
  AssertNumberOfImmutableMemtables({0, 1, 0, 3});
  ASSERT_EQ(CountLiveLogFiles(), 9);
  // Push "three" over its threshold.
  PutRandomData(3, 90, 1000);
  WaitForFlush(3);
  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 10);
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 0, 1});
  ASSERT_EQ(CountLiveLogFiles(), 11);
  // Flushing "one" makes several old logs obsolete; they get deleted.
  PutRandomData(1, 200, 1000);
  WaitForFlush(1);
  AssertNumberOfImmutableMemtables({0, 0, 0, 1});
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(3, 90 * 6, 1000);
  WaitForFlush(3);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 12);
  PutRandomData(0, 100, 1000);
  WaitForFlush(0);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 12);
  PutRandomData(2, 3 * 100, 10000);
  WaitForFlush(2);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 12);
  // Final flush of "one" lets the remaining stale logs be collected.
  PutRandomData(1, 2 * 200, 1000);
  WaitForFlush(1);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 7);
  Close();
}
// Each column family may carry its own merge operator; Merge() on a family
// without one must fail with NotSupported.
TEST(ColumnFamilyTest, DifferentMergeOperators) {
  Open();
  CreateColumnFamilies({"first", "second"});
  ColumnFamilyOptions default_cf, first, second;
  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
  second.merge_operator = MergeOperators::CreateStringAppendOperator();
  Reopen({default_cf, first, second});

  // Fixed-width 64-bit encodings of 1, 2 and 3.
  std::string val_one, val_two, val_three;
  PutFixed64(&val_one, 1);
  PutFixed64(&val_two, 2);
  PutFixed64(&val_three, 3);

  // "default" has no merge operator: Merge is rejected and the last Put wins.
  ASSERT_OK(Put(0, "foo", val_two));
  ASSERT_OK(Put(0, "foo", val_one));
  ASSERT_TRUE(Merge(0, "foo", val_two).IsNotSupported());
  ASSERT_EQ(Get(0, "foo"), val_one);

  // "first" adds uint64 operands: 1 + 2 == 3.
  ASSERT_OK(Put(1, "foo", val_two));
  ASSERT_OK(Put(1, "foo", val_one));
  ASSERT_OK(Merge(1, "foo", val_two));
  ASSERT_EQ(Get(1, "foo"), val_three);

  // "second" appends operands as strings separated by ','.
  ASSERT_OK(Put(2, "foo", val_two));
  ASSERT_OK(Put(2, "foo", val_one));
  ASSERT_OK(Merge(2, "foo", val_two));
  ASSERT_EQ(Get(2, "foo"), val_one + "," + val_two);
  Close();
}
TEST(ColumnFamilyTest, DifferentCompactionStyles) {
// Runs three families side by side: "default" uses level style (and is
// driven into a read/seek-triggered compaction), "one" uses universal
// style, "two" uses level style with 4 levels.
Open();
CreateColumnFamilies({"one", "two"});
ColumnFamilyOptions default_cf, one, two;
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.disableDataSync = true;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10;
default_cf.filter_policy = nullptr;
default_cf.no_block_cache = true;
default_cf.source_compaction_factor = 100;
// seek compactions stay enabled so the Get() loop below can trigger one
default_cf.disable_seek_compaction = false;
one.compaction_style = kCompactionStyleUniversal;
// trigger compaction if there are >= 4 files
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 100000;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.max_mem_compaction_level = 0;
two.level0_file_num_compaction_trigger = 3;
two.write_buffer_size = 100000;
Reopen({default_cf, one, two});
// SETUP column family "default" - test read compaction
ASSERT_EQ("", FilesPerLevel(0));
PutRandomData(0, 1, 4096);
ASSERT_OK(Flush(0));
ASSERT_EQ("0,0,1", FilesPerLevel(0));
// write 8MB
PutRandomData(0, 2000, 4096);
ASSERT_OK(Flush(0));
// clear levels 0 and 1
dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
// write some new keys into level 0 and 1
PutRandomData(0, 1024, 512);
ASSERT_OK(Flush(0));
WaitForCompaction();
PutRandomData(0, 10, 512);
ASSERT_OK(Flush(0));
// remember number of files in each level
// NOTE: despite the names, l1/l2/l3 hold the file counts of
// levels 0, 1 and 2 respectively.
int l1 = NumTableFilesAtLevel(0, 0);
int l2 = NumTableFilesAtLevel(1, 0);
int l3 = NumTableFilesAtLevel(2, 0);
ASSERT_NE(l1, 0);
ASSERT_NE(l2, 0);
ASSERT_NE(l3, 0);
// SETUP column family "one" -- universal style
// stop one file short of the compaction trigger (4)
for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(1, 11, 10000);
WaitForFlush(1);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
}
// SETUP column family "two" -- level style with 4 levels
// stop one file short of the compaction trigger (3)
for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(2, 15, 10000);
WaitForFlush(2);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
}
// TRIGGER compaction "default"
// read a bunch of times, trigger read compaction
for (int i = 0; i < 200000; ++i) {
Get(0, std::to_string(i));
}
// TRIGGER compaction "one"
PutRandomData(1, 12, 10000);
// TRIGGER compaction "two"
PutRandomData(2, 10, 10000);
// WAIT for compactions
WaitForCompaction();
// VERIFY compaction "default"
// verify that the number of files have decreased
// in some level, indicating that there was a compaction
ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
NumTableFilesAtLevel(1, 0) < l2 ||
NumTableFilesAtLevel(2, 0) < l3);
// VERIFY compaction "one"
// universal compaction merged everything back into a single file
ASSERT_EQ("1", FilesPerLevel(1));
// VERIFY compaction "two"
ASSERT_EQ("0,1", FilesPerLevel(2));
// a manual full compaction must leave the shape unchanged
CompactAll(2);
ASSERT_EQ("0,1", FilesPerLevel(2));
Close();
}
namespace {
// Renders an iterator's current position as "key->value", or "(invalid)"
// when the iterator is not positioned on an entry.
std::string IterStatus(Iterator* iter) {
  if (!iter->Valid()) {
    return "(invalid)";
  }
  return iter->key().ToString() + "->" + iter->value().ToString();
}
}  // anonymous namespace
TEST(ColumnFamilyTest, NewIteratorsTest) {
// iter == 0 -- regular (non-tailing) iterators
// iter == 1 -- tailing iterators (options.tailing is set below)
for (int iter = 0; iter < 2; ++iter) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
ASSERT_OK(Put(0, "a", "b"));
ASSERT_OK(Put(1, "b", "a"));
ASSERT_OK(Put(2, "c", "m"));
ASSERT_OK(Put(2, "v", "t"));
std::vector<Iterator*> iterators;
ReadOptions options;
options.tailing = (iter == 1);
// NewIterators yields one iterator per column family handle.
ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
for (auto it : iterators) {
it->SeekToFirst();
}
ASSERT_EQ(IterStatus(iterators[0]), "a->b");
ASSERT_EQ(IterStatus(iterators[1]), "b->a");
ASSERT_EQ(IterStatus(iterators[2]), "c->m");
// write into "one" *after* the iterators were created
ASSERT_OK(Put(1, "x", "x"));
for (auto it : iterators) {
it->Next();
}
// "default" holds a single key, so its iterator is exhausted
ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
if (iter == 0) {
// no tailing: the iterator does not observe the later Put(1, "x", "x")
ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
} else {
// tailing: the iterator picks up writes made after its creation
ASSERT_EQ(IterStatus(iterators[1]), "x->x");
}
ASSERT_EQ(IterStatus(iterators[2]), "v->t");
for (auto it : iterators) {
delete it;
}
Destroy();
}
}
} // namespace rocksdb
int main(int argc, char** argv) {
// Runs every TEST(...) case registered in this file.
// argc/argv are part of the standard entry-point signature but unused here.
return rocksdb::test::RunAllTests();
}

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction.h" #include "db/compaction.h"
#include "db/column_family.h"
namespace rocksdb { namespace rocksdb {
@ -29,6 +30,7 @@ Compaction::Compaction(Version* input_version, int level, int out_level,
max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
input_version_(input_version), input_version_(input_version),
number_levels_(input_version_->NumberLevels()), number_levels_(input_version_->NumberLevels()),
cfd_(input_version_->cfd_),
seek_compaction_(seek_compaction), seek_compaction_(seek_compaction),
enable_compression_(enable_compression), enable_compression_(enable_compression),
grandparent_index_(0), grandparent_index_(0),
@ -42,8 +44,10 @@ Compaction::Compaction(Version* input_version, int level, int out_level,
is_manual_compaction_(false), is_manual_compaction_(false),
level_ptrs_(std::vector<size_t>(number_levels_)) { level_ptrs_(std::vector<size_t>(number_levels_)) {
cfd_->Ref();
input_version_->Ref(); input_version_->Ref();
edit_ = new VersionEdit(); edit_ = new VersionEdit();
edit_->SetColumnFamily(cfd_->GetID());
for (int i = 0; i < number_levels_; i++) { for (int i = 0; i < number_levels_; i++) {
level_ptrs_[i] = 0; level_ptrs_[i] = 0;
} }
@ -54,6 +58,11 @@ Compaction::~Compaction() {
if (input_version_ != nullptr) { if (input_version_ != nullptr) {
input_version_->Unref(); input_version_->Unref();
} }
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
}
} }
bool Compaction::IsTrivialMove() const { bool Compaction::IsTrivialMove() const {
@ -77,12 +86,11 @@ void Compaction::AddInputDeletions(VersionEdit* edit) {
} }
bool Compaction::IsBaseLevelForKey(const Slice& user_key) { bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
if (input_version_->vset_->options_->compaction_style == if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
kCompactionStyleUniversal) {
return bottommost_level_; return bottommost_level_;
} }
// Maybe use binary search to find right entry instead of linear search? // Maybe use binary search to find right entry instead of linear search?
const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); const Comparator* user_cmp = cfd_->user_comparator();
for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files = input_version_->files_[lvl]; const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
for (; level_ptrs_[lvl] < files.size(); ) { for (; level_ptrs_[lvl] < files.size(); ) {
@ -103,7 +111,7 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
bool Compaction::ShouldStopBefore(const Slice& internal_key) { bool Compaction::ShouldStopBefore(const Slice& internal_key) {
// Scan to find earliest grandparent file that contains key. // Scan to find earliest grandparent file that contains key.
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; const InternalKeyComparator* icmp = &cfd_->internal_comparator();
while (grandparent_index_ < grandparents_.size() && while (grandparent_index_ < grandparents_.size() &&
icmp->Compare(internal_key, icmp->Compare(internal_key,
grandparents_[grandparent_index_]->largest.Encode()) > 0) { grandparents_[grandparent_index_]->largest.Encode()) > 0) {
@ -141,8 +149,7 @@ void Compaction::MarkFilesBeingCompacted(bool value) {
// Is this compaction producing files at the bottommost level? // Is this compaction producing files at the bottommost level?
void Compaction::SetupBottomMostLevel(bool isManual) { void Compaction::SetupBottomMostLevel(bool isManual) {
if (input_version_->vset_->options_->compaction_style == if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
kCompactionStyleUniversal) {
// If universal compaction style is used and manual // If universal compaction style is used and manual
// compaction is occuring, then we are guaranteed that // compaction is occuring, then we are guaranteed that
// all files will be picked in a single compaction // all files will be picked in a single compaction
@ -155,8 +162,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) {
return; return;
} }
bottommost_level_ = true; bottommost_level_ = true;
int num_levels = input_version_->vset_->NumberLevels(); for (int i = output_level() + 1; i < number_levels_; i++) {
for (int i = output_level() + 1; i < num_levels; i++) {
if (input_version_->NumLevelFiles(i) > 0) { if (input_version_->NumLevelFiles(i) > 0) {
bottommost_level_ = false; bottommost_level_ = false;
break; break;
@ -169,6 +175,16 @@ void Compaction::ReleaseInputs() {
input_version_->Unref(); input_version_->Unref();
input_version_ = nullptr; input_version_ = nullptr;
} }
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
cfd_ = nullptr;
}
}
void Compaction::ReleaseCompactionFiles(Status status) {
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
} }
void Compaction::ResetNextCompactionIndex() { void Compaction::ResetNextCompactionIndex() {

@ -13,6 +13,7 @@
namespace rocksdb { namespace rocksdb {
class Version; class Version;
class ColumnFamilyData;
// A Compaction encapsulates information about a compaction. // A Compaction encapsulates information about a compaction.
class Compaction { class Compaction {
@ -36,6 +37,8 @@ class Compaction {
// Returns input version of the compaction // Returns input version of the compaction
Version* input_version() const { return input_version_; } Version* input_version() const { return input_version_; }
ColumnFamilyData* column_family_data() const { return cfd_; }
// Return the ith input file at "level()+which" ("which" must be 0 or 1). // Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; } FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
@ -67,6 +70,10 @@ class Compaction {
// is successful. // is successful.
void ReleaseInputs(); void ReleaseInputs();
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
void ReleaseCompactionFiles(Status status);
void Summary(char* output, int len); void Summary(char* output, int len);
// Return the score that was used to pick this compaction run. // Return the score that was used to pick this compaction run.
@ -97,6 +104,7 @@ class Compaction {
Version* input_version_; Version* input_version_;
VersionEdit* edit_; VersionEdit* edit_;
int number_levels_; int number_levels_;
ColumnFamilyData* cfd_;
bool seek_compaction_; bool seek_compaction_;
bool enable_compression_; bool enable_compression_;

@ -277,14 +277,10 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) {
Log(options_->info_log, Log(options_->info_log,
"Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)"
"\n", "\n",
(unsigned long)level, (unsigned long)level, (unsigned long)(c->inputs_[0].size()),
(unsigned long)(c->inputs_[0].size()), (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
(unsigned long)(c->inputs_[1].size()), (unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
(unsigned long)inputs0_size, (unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
(unsigned long)inputs1_size,
(unsigned long)(expanded0.size()),
(unsigned long)(expanded1.size()),
(unsigned long)expanded0_size,
(unsigned long)inputs1_size); (unsigned long)inputs1_size);
smallest = new_start; smallest = new_start;
largest = new_limit; largest = new_limit;
@ -587,7 +583,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
options_->level0_file_num_compaction_trigger; options_->level0_file_num_compaction_trigger;
if ((c = PickCompactionUniversalReadAmp( if ((c = PickCompactionUniversalReadAmp(
version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
Log(options_->info_log, "Universal: compacting for file num\n"); LogToBuffer(log_buffer, "Universal: compacting for file num\n");
} }
} }
} }
@ -653,7 +649,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
FileMetaData* f = nullptr; FileMetaData* f = nullptr;
bool done = false; bool done = false;
int start_index = 0; int start_index = 0;
unsigned int candidate_count; unsigned int candidate_count = 0;
assert(file_by_time.size() == version->files_[level].size()); assert(file_by_time.size() == version->files_[level].size());
unsigned int max_files_to_compact = std::min(max_merge_width, unsigned int max_files_to_compact = std::min(max_merge_width,

@ -12,6 +12,7 @@
#include "db/compaction.h" #include "db/compaction.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/env.h"
#include <vector> #include <vector>
#include <memory> #include <memory>
@ -118,6 +119,7 @@ class CompactionPicker {
std::unique_ptr<uint64_t[]> level_max_bytes_; std::unique_ptr<uint64_t[]> level_max_bytes_;
const Options* const options_; const Options* const options_;
private: private:
int num_levels_; int num_levels_;

@ -42,7 +42,6 @@
DEFINE_string(benchmarks, DEFINE_string(benchmarks,
"fillseq," "fillseq,"
"fillsync," "fillsync,"
"fillrandom," "fillrandom,"
@ -53,6 +52,7 @@ DEFINE_string(benchmarks,
"readreverse," "readreverse,"
"compact," "compact,"
"readrandom," "readrandom,"
"multireadrandom,"
"readseq," "readseq,"
"readtocache," "readtocache,"
"readreverse," "readreverse,"
@ -64,8 +64,7 @@ DEFINE_string(benchmarks,
"crc32c," "crc32c,"
"compress," "compress,"
"uncompress," "uncompress,"
"acquireload," "acquireload,",
"fillfromstdin,",
"Comma-separated list of operations to run in the specified order" "Comma-separated list of operations to run in the specified order"
"Actual benchmarks:\n" "Actual benchmarks:\n"
@ -129,16 +128,8 @@ DEFINE_int64(merge_keys, -1,
DEFINE_int64(reads, -1, "Number of read operations to do. " DEFINE_int64(reads, -1, "Number of read operations to do. "
"If negative, do FLAGS_num reads."); "If negative, do FLAGS_num reads.");
DEFINE_int64(read_range, 1, "When ==1 reads use ::Get, when >1 reads use"
" an iterator");
DEFINE_bool(use_prefix_blooms, false, "Whether to place prefixes in blooms");
DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
DEFINE_bool(use_prefix_api, false, "Whether to set ReadOptions.prefix for"
" prefixscanrandom. If true, use_prefix_blooms must also be true.");
DEFINE_int64(seed, 0, "Seed base for random number generators. " DEFINE_int64(seed, 0, "Seed base for random number generators. "
"When 0 it is deterministic."); "When 0 it is deterministic.");
@ -278,12 +269,6 @@ DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
DEFINE_bool(use_snapshot, false, "If true, create a snapshot per query when"
" randomread benchmark is used");
DEFINE_bool(get_approx, false, "If true, call GetApproximateSizes per query"
" when read_range is > 1 and randomread benchmark is used");
DEFINE_int32(num_levels, 7, "The total number of levels"); DEFINE_int32(num_levels, 7, "The total number of levels");
DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
@ -461,20 +446,9 @@ DEFINE_string(compaction_fadvice, "NORMAL",
static auto FLAGS_compaction_fadvice_e = static auto FLAGS_compaction_fadvice_e =
rocksdb::Options().access_hint_on_compaction_start; rocksdb::Options().access_hint_on_compaction_start;
DEFINE_bool(use_multiget, false,
"Use multiget to access a series of keys instead of get");
DEFINE_bool(use_tailing_iterator, false, DEFINE_bool(use_tailing_iterator, false,
"Use tailing iterator to access a series of keys instead of get"); "Use tailing iterator to access a series of keys instead of get");
DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number"
" of keys to group per call Arbitrary default is good because it"
" agrees with readwritepercent");
// TODO: Apply this flag to generic Get calls too. Currently only with Multiget
DEFINE_bool(warn_missing_keys, true, "Print a message to user when a key is"
" missing in a Get/MultiGet call");
DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex, DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
"Use adaptive mutex"); "Use adaptive mutex");
@ -798,7 +772,7 @@ class Duration {
start_at_ = FLAGS_env->NowMicros(); start_at_ = FLAGS_env->NowMicros();
} }
bool Done(int increment) { bool Done(int64_t increment) {
if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
ops_ += increment; ops_ += increment;
@ -834,13 +808,12 @@ class Benchmark {
int key_size_; int key_size_;
int prefix_size_; int prefix_size_;
int64_t keys_per_prefix_; int64_t keys_per_prefix_;
int entries_per_batch_; int64_t entries_per_batch_;
WriteOptions write_options_; WriteOptions write_options_;
int64_t reads_; int64_t reads_;
int64_t writes_; int64_t writes_;
int64_t readwrites_; int64_t readwrites_;
int64_t merge_keys_; int64_t merge_keys_;
int heap_counter_;
void PrintHeader() { void PrintHeader() {
PrintEnvironment(); PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size); fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size);
@ -1037,8 +1010,7 @@ class Benchmark {
readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num :
((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)
), ),
merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys), merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) {
heap_counter_(0) {
if (FLAGS_prefix_size > FLAGS_key_size) { if (FLAGS_prefix_size > FLAGS_key_size) {
fprintf(stderr, "prefix size is larger than key size"); fprintf(stderr, "prefix size is larger than key size");
exit(1); exit(1);
@ -1062,6 +1034,10 @@ class Benchmark {
delete prefix_extractor_; delete prefix_extractor_;
} }
Slice AllocateKey() {
return Slice(new char[key_size_], key_size_);
}
// Generate key according to the given specification and random number. // Generate key according to the given specification and random number.
// The resulting key will have the following format (if keys_per_prefix_ // The resulting key will have the following format (if keys_per_prefix_
// is positive), extra trailing bytes are either cut off or paddd with '0'. // is positive), extra trailing bytes are either cut off or paddd with '0'.
@ -1074,10 +1050,8 @@ class Benchmark {
// ---------------------------- // ----------------------------
// | key 00000 | // | key 00000 |
// ---------------------------- // ----------------------------
std::string GenerateKeyFromInt(uint64_t v, int64_t num_keys) { void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
std::string key; char* start = const_cast<char*>(key->data());
key.resize(key_size_);
char* start = &(key[0]);
char* pos = start; char* pos = start;
if (keys_per_prefix_ > 0) { if (keys_per_prefix_ > 0) {
int64_t num_prefix = num_keys / keys_per_prefix_; int64_t num_prefix = num_keys / keys_per_prefix_;
@ -1109,8 +1083,6 @@ class Benchmark {
if (key_size_ > pos - start) { if (key_size_ > pos - start) {
memset(pos, '0', key_size_ - (pos - start)); memset(pos, '0', key_size_ - (pos - start));
} }
return key;
} }
void Run() { void Run() {
@ -1155,15 +1127,12 @@ class Benchmark {
} else if (name == Slice("fillrandom")) { } else if (name == Slice("fillrandom")) {
fresh_db = true; fresh_db = true;
method = &Benchmark::WriteRandom; method = &Benchmark::WriteRandom;
} else if (name == Slice("fillfromstdin")) {
fresh_db = true;
method = &Benchmark::WriteFromStdin;
} else if (name == Slice("filluniquerandom")) { } else if (name == Slice("filluniquerandom")) {
fresh_db = true; fresh_db = true;
if (num_threads > 1) { if (num_threads > 1) {
fprintf(stderr, "filluniquerandom multithreaded not supported" fprintf(stderr, "filluniquerandom multithreaded not supported"
" set --threads=1"); ", use 1 thread");
exit(1); num_threads = 1;
} }
method = &Benchmark::WriteUniqueRandom; method = &Benchmark::WriteUniqueRandom;
} else if (name == Slice("overwrite")) { } else if (name == Slice("overwrite")) {
@ -1189,19 +1158,18 @@ class Benchmark {
method = &Benchmark::ReadReverse; method = &Benchmark::ReadReverse;
} else if (name == Slice("readrandom")) { } else if (name == Slice("readrandom")) {
method = &Benchmark::ReadRandom; method = &Benchmark::ReadRandom;
} else if (name == Slice("multireadrandom")) {
method = &Benchmark::MultiReadRandom;
} else if (name == Slice("readmissing")) { } else if (name == Slice("readmissing")) {
method = &Benchmark::ReadMissing; ++key_size_;
method = &Benchmark::ReadRandom;
} else if (name == Slice("newiterator")) { } else if (name == Slice("newiterator")) {
method = &Benchmark::IteratorCreation; method = &Benchmark::IteratorCreation;
} else if (name == Slice("seekrandom")) { } else if (name == Slice("seekrandom")) {
method = &Benchmark::SeekRandom; method = &Benchmark::SeekRandom;
} else if (name == Slice("readhot")) {
method = &Benchmark::ReadHot;
} else if (name == Slice("readrandomsmall")) { } else if (name == Slice("readrandomsmall")) {
reads_ /= 1000; reads_ /= 1000;
method = &Benchmark::ReadRandom; method = &Benchmark::ReadRandom;
} else if (name == Slice("prefixscanrandom")) {
method = &Benchmark::PrefixScanRandom;
} else if (name == Slice("deleteseq")) { } else if (name == Slice("deleteseq")) {
method = &Benchmark::DeleteSeq; method = &Benchmark::DeleteSeq;
} else if (name == Slice("deleterandom")) { } else if (name == Slice("deleterandom")) {
@ -1215,10 +1183,9 @@ class Benchmark {
if (FLAGS_merge_operator.empty()) { if (FLAGS_merge_operator.empty()) {
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
name.ToString().c_str()); name.ToString().c_str());
method = nullptr; exit(1);
} else {
method = &Benchmark::ReadRandomMergeRandom;
} }
method = &Benchmark::ReadRandomMergeRandom;
} else if (name == Slice("updaterandom")) { } else if (name == Slice("updaterandom")) {
method = &Benchmark::UpdateRandom; method = &Benchmark::UpdateRandom;
} else if (name == Slice("appendrandom")) { } else if (name == Slice("appendrandom")) {
@ -1227,10 +1194,9 @@ class Benchmark {
if (FLAGS_merge_operator.empty()) { if (FLAGS_merge_operator.empty()) {
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
name.ToString().c_str()); name.ToString().c_str());
method = nullptr; exit(1);
} else {
method = &Benchmark::MergeRandom;
} }
method = &Benchmark::MergeRandom;
} else if (name == Slice("randomwithverify")) { } else if (name == Slice("randomwithverify")) {
method = &Benchmark::RandomWithVerify; method = &Benchmark::RandomWithVerify;
} else if (name == Slice("compact")) { } else if (name == Slice("compact")) {
@ -1243,8 +1209,6 @@ class Benchmark {
method = &Benchmark::Compress; method = &Benchmark::Compress;
} else if (name == Slice("uncompress")) { } else if (name == Slice("uncompress")) {
method = &Benchmark::Uncompress; method = &Benchmark::Uncompress;
} else if (name == Slice("heapprofile")) {
HeapProfile();
} else if (name == Slice("stats")) { } else if (name == Slice("stats")) {
PrintStats("rocksdb.stats"); PrintStats("rocksdb.stats");
} else if (name == Slice("levelstats")) { } else if (name == Slice("levelstats")) {
@ -1254,6 +1218,7 @@ class Benchmark {
} else { } else {
if (name != Slice()) { // No error message for empty name if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
exit(1);
} }
} }
@ -1540,7 +1505,7 @@ class Benchmark {
options.compaction_style = FLAGS_compaction_style_e; options.compaction_style = FLAGS_compaction_style_e;
options.block_size = FLAGS_block_size; options.block_size = FLAGS_block_size;
options.filter_policy = filter_policy_; options.filter_policy = filter_policy_;
if (FLAGS_use_plain_table || FLAGS_use_prefix_blooms) { if (FLAGS_use_plain_table) {
options.prefix_extractor.reset( options.prefix_extractor.reset(
NewFixedPrefixTransform(FLAGS_prefix_size)); NewFixedPrefixTransform(FLAGS_prefix_size));
} }
@ -1715,54 +1680,6 @@ class Benchmark {
DoWrite(thread, UNIQUE_RANDOM); DoWrite(thread, UNIQUE_RANDOM);
} }
void writeOrFail(WriteBatch& batch) {
Status s = db_->Write(write_options_, &batch);
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
}
void WriteFromStdin(ThreadState* thread) {
size_t count = 0;
WriteBatch batch;
const size_t bufferLen = 32 << 20;
unique_ptr<char[]> line = unique_ptr<char[]>(new char[bufferLen]);
char* linep = line.get();
const int batchSize = 100 << 10;
const char columnSeparator = '\t';
const char lineSeparator = '\n';
while (fgets(linep, bufferLen, stdin) != nullptr) {
++count;
char* tab = std::find(linep, linep + bufferLen, columnSeparator);
if (tab == linep + bufferLen) {
fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count);
continue;
}
Slice key(linep, tab - linep);
tab++;
char* endLine = std::find(tab, linep + bufferLen, lineSeparator);
if (endLine == linep + bufferLen) {
fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count);
continue;
}
Slice value(tab, endLine - tab);
thread->stats.FinishedSingleOp(db_);
thread->stats.AddBytes(endLine - linep - 1);
if (batch.Count() < batchSize) {
batch.Put(key, value);
continue;
}
writeOrFail(batch);
batch.Clear();
}
if (batch.Count() > 0) {
writeOrFail(batch);
}
}
void DoWrite(ThreadState* thread, WriteMode write_mode) { void DoWrite(ThreadState* thread, WriteMode write_mode) {
const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0; const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
const int64_t num_ops = writes_ == 0 ? num_ : writes_; const int64_t num_ops = writes_ == 0 ? num_ : writes_;
@ -1783,10 +1700,13 @@ class Benchmark {
WriteBatch batch; WriteBatch batch;
Status s; Status s;
int64_t bytes = 0; int64_t bytes = 0;
int i = 0; int64_t i = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
while (!duration.Done(entries_per_batch_)) { while (!duration.Done(entries_per_batch_)) {
batch.Clear(); batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) { for (int64_t j = 0; j < entries_per_batch_; j++) {
int64_t k = 0; int64_t k = 0;
switch(write_mode) { switch(write_mode) {
case SEQUENTIAL: case SEQUENTIAL:
@ -1825,9 +1745,9 @@ class Benchmark {
break; break;
} }
}; };
std::string key = GenerateKeyFromInt(k, FLAGS_num); GenerateKeyFromInt(k, FLAGS_num, &key);
batch.Put(key, gen.Generate(value_size_)); batch.Put(key, gen.Generate(value_size_));
bytes += value_size_ + key.size(); bytes += value_size_ + key_size_;
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
s = db_->Write(write_options_, &batch); s = db_->Write(write_options_, &batch);
@ -1866,135 +1786,22 @@ class Benchmark {
thread->stats.AddBytes(bytes); thread->stats.AddBytes(bytes);
} }
// Calls MultiGet over a list of keys from a random distribution.
// Returns the total number of keys found.
long MultiGetRandom(ReadOptions& options, int num_keys,
Random64* rand, int64_t range, const char* suffix) {
assert(num_keys > 0);
std::vector<Slice> keys(num_keys);
std::vector<std::string> values(num_keys);
std::vector<std::string> gen_keys(num_keys);
int i;
int64_t k;
// Fill the keys vector
for(i=0; i<num_keys; ++i) {
k = rand->Next() % range;
gen_keys[i] = GenerateKeyFromInt(k, range) + suffix;
keys[i] = gen_keys[i];
}
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
// Apply the operation
std::vector<Status> statuses = db_->MultiGet(options, keys, &values);
assert((long)statuses.size() == num_keys);
assert((long)keys.size() == num_keys); // Should always be the case.
assert((long)values.size() == num_keys);
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
options.snapshot = nullptr;
}
// Count number found
long found = 0;
for(i=0; i<num_keys; ++i) {
if (statuses[i].ok()){
++found;
} else if (FLAGS_warn_missing_keys == true) {
// Key not found, or error.
fprintf(stderr, "get error: %s\n", statuses[i].ToString().c_str());
}
}
return found;
}
void ReadRandom(ThreadState* thread) { void ReadRandom(ThreadState* thread) {
ReadOptions options(FLAGS_verify_checksum, true);
Duration duration(FLAGS_duration, reads_);
int64_t found = 0;
int64_t read = 0; int64_t read = 0;
if (FLAGS_use_multiget) { // MultiGet int64_t found = 0;
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group ReadOptions options(FLAGS_verify_checksum, true);
long keys_left = reads_; Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// Recalculate number of keys per group, and call MultiGet until done std::string value;
long num_keys;
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
read += num_keys;
found +=
MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "");
thread->stats.FinishedSingleOp(db_);
keys_left -= num_keys;
}
} else if (FLAGS_use_tailing_iterator) { // use tailing iterator for gets
options.tailing = true;
Iterator* iter = db_->NewIterator(options);
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
iter->Seek(key);
read++;
if (iter->Valid() && iter->key().compare(Slice(key)) == 0) {
found++;
}
thread->stats.FinishedSingleOp(db_);
}
delete iter;
} else { // Regular case. Do one "get" at a time Get
options.tailing = true;
options.prefix_seek = (FLAGS_prefix_size == 0);
Iterator* iter = db_->NewIterator(options);
std::string value;
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_read_range < 2) {
read++;
if (db_->Get(options, key, &value).ok()) {
found++;
}
} else {
int count = 1;
if (FLAGS_get_approx) {
std::string key2 =
GenerateKeyFromInt(k + static_cast<int>(FLAGS_read_range),
FLAGS_num + FLAGS_read_range);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
read += FLAGS_read_range;
for (iter->Seek(key);
iter->Valid() && count <= FLAGS_read_range;
++count, iter->Next()) {
found++;
}
}
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
options.snapshot = nullptr;
}
thread->stats.FinishedSingleOp(db_); Duration duration(FLAGS_duration, reads_);
while (!duration.Done(1)) {
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
read++;
if (db_->Get(options, key, &value).ok()) {
found++;
} }
thread->stats.FinishedSingleOp(db_);
delete iter;
} }
char msg[100]; char msg[100];
@ -2008,113 +1815,41 @@ class Benchmark {
} }
} }
void PrefixScanRandom(ThreadState* thread) { // Calls MultiGet over a list of keys from a random distribution.
if (FLAGS_use_prefix_api) { // Returns the total number of keys found.
assert(FLAGS_use_prefix_blooms); void MultiReadRandom(ThreadState* thread) {
assert(FLAGS_bloom_bits >= 1); int64_t read = 0;
}
ReadOptions options(FLAGS_verify_checksum, true);
Duration duration(FLAGS_duration, reads_);
int64_t found = 0; int64_t found = 0;
while (!duration.Done(1)) {
std::string value;
const int k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
Slice skey(key);
Slice prefix = prefix_extractor_->Transform(skey);
options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr;
Iterator* iter = db_->NewIterator(options);
for (iter->Seek(skey);
iter->Valid() && iter->key().starts_with(prefix);
iter->Next()) {
found++;
}
delete iter;
thread->stats.FinishedSingleOp(db_);
}
char msg[100];
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
found, reads_);
thread->stats.AddMessage(msg);
}
void ReadMissing(ThreadState* thread) {
FLAGS_warn_missing_keys = false; // Never warn about missing keys
Duration duration(FLAGS_duration, reads_);
ReadOptions options(FLAGS_verify_checksum, true); ReadOptions options(FLAGS_verify_checksum, true);
std::vector<Slice> keys(entries_per_batch_);
if (FLAGS_use_multiget) { std::vector<std::string> values(entries_per_batch_);
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group while (keys.size() < entries_per_batch_) {
long keys_left = reads_; keys.push_back(AllocateKey());
// Recalculate number of keys per group, and call MultiGet until done
long num_keys;
long found;
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
found =
MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, ".");
// We should not find any key since the key we try to get has a
// different suffix
if (found) {
assert(false);
}
thread->stats.FinishedSingleOp(db_);
keys_left -= num_keys;
}
} else { // Regular case (not MultiGet)
std::string value;
Status s;
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num) + ".";
s = db_->Get(options, key, &value);
assert(!s.ok() && s.IsNotFound());
thread->stats.FinishedSingleOp(db_);
}
} }
}
void ReadHot(ThreadState* thread) {
Duration duration(FLAGS_duration, reads_); Duration duration(FLAGS_duration, reads_);
ReadOptions options(FLAGS_verify_checksum, true); while (!duration.Done(1)) {
const int64_t range = (FLAGS_num + 99) / 100; for (int64_t i = 0; i < entries_per_batch_; ++i) {
int64_t found = 0; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num,
FLAGS_num, &keys[i]);
if (FLAGS_use_multiget) {
const int64_t kpg = FLAGS_keys_per_multiget; // keys per multiget group
int64_t keys_left = reads_;
// Recalculate number of keys per group, and call MultiGet until done
long num_keys;
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
found += MultiGetRandom(options, num_keys, &thread->rand, range, "");
thread->stats.FinishedSingleOp(db_);
keys_left -= num_keys;
} }
} else { std::vector<Status> statuses = db_->MultiGet(options, keys, &values);
std::string value; assert(statuses.size() == entries_per_batch_);
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % range; read += entries_per_batch_;
std::string key = GenerateKeyFromInt(k, range); for (int64_t i = 0; i < entries_per_batch_; ++i) {
if (db_->Get(options, key, &value).ok()) { if (statuses[i].ok()) {
++found; ++found;
} }
thread->stats.FinishedSingleOp(db_);
} }
} }
for (auto& k : keys) {
delete k.data();
}
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
found, reads_); found, read);
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
@ -2129,44 +1864,53 @@ class Benchmark {
} }
void SeekRandom(ThreadState* thread) { void SeekRandom(ThreadState* thread) {
Duration duration(FLAGS_duration, reads_); int64_t read = 0;
ReadOptions options(FLAGS_verify_checksum, true);
std::string value;
int64_t found = 0; int64_t found = 0;
ReadOptions options(FLAGS_verify_checksum, true);
options.tailing = FLAGS_use_tailing_iterator;
auto* iter = db_->NewIterator(options);
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
Duration duration(FLAGS_duration, reads_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
Iterator* iter = db_->NewIterator(options); GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
iter->Seek(key); iter->Seek(key);
if (iter->Valid() && iter->key() == Slice(key)) found++; read++;
delete iter; if (iter->Valid() && iter->key().compare(key) == 0) {
found++;
}
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
delete iter;
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
found, num_); found, read);
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
void DoDelete(ThreadState* thread, bool seq) { void DoDelete(ThreadState* thread, bool seq) {
WriteBatch batch; WriteBatch batch;
Status s;
Duration duration(seq ? 0 : FLAGS_duration, num_); Duration duration(seq ? 0 : FLAGS_duration, num_);
long i = 0; int64_t i = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
while (!duration.Done(entries_per_batch_)) { while (!duration.Done(entries_per_batch_)) {
batch.Clear(); batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) { for (int64_t j = 0; j < entries_per_batch_; ++j) {
const int64_t k = seq ? i+j : (thread->rand.Next() % FLAGS_num); const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
std::string key = GenerateKeyFromInt(k, FLAGS_num); GenerateKeyFromInt(k, FLAGS_num, &key);
batch.Delete(key); batch.Delete(key);
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
s = db_->Write(write_options_, &batch); auto s = db_->Write(write_options_, &batch);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "del error: %s\n", s.ToString().c_str()); fprintf(stderr, "del error: %s\n", s.ToString().c_str());
exit(1); exit(1);
} }
++i; i += entries_per_batch_;
} }
} }
@ -2197,6 +1941,9 @@ class Benchmark {
// Don't merge stats from this thread with the readers. // Don't merge stats from this thread with the readers.
thread->stats.SetExcludeFromMerge(); thread->stats.SetExcludeFromMerge();
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
while (true) { while (true) {
{ {
MutexLock l(&thread->shared->mu); MutexLock l(&thread->shared->mu);
@ -2206,8 +1953,7 @@ class Benchmark {
} }
} }
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
@ -2235,7 +1981,7 @@ class Benchmark {
// Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V) // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
// in DB atomically i.e in a single batch. Also refer GetMany. // in DB atomically i.e in a single batch. Also refer GetMany.
Status PutMany(const WriteOptions& writeoptions, Status PutMany(const WriteOptions& writeoptions,
const Slice& key, const Slice& value) { const Slice& key, const Slice& value) {
std::string suffixes[3] = {"2", "1", "0"}; std::string suffixes[3] = {"2", "1", "0"};
std::string keys[3]; std::string keys[3];
@ -2273,7 +2019,7 @@ class Benchmark {
// in the same snapshot, and verifies that all the values are identical. // in the same snapshot, and verifies that all the values are identical.
// ASSUMES that PutMany was used to put (K, V) into the DB. // ASSUMES that PutMany was used to put (K, V) into the DB.
Status GetMany(const ReadOptions& readoptions, Status GetMany(const ReadOptions& readoptions,
const Slice& key, std::string* value) { const Slice& key, std::string* value) {
std::string suffixes[3] = {"0", "1", "2"}; std::string suffixes[3] = {"0", "1", "2"};
std::string keys[3]; std::string keys[3];
Slice key_slices[3]; Slice key_slices[3];
@ -2328,16 +2074,19 @@ class Benchmark {
int64_t puts_done = 0; int64_t puts_done = 0;
int64_t deletes_done = 0; int64_t deletes_done = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
for (int64_t i = 0; i < readwrites_; i++) { for (int64_t i = 0; i < readwrites_; i++) {
const int64_t k = thread->rand.Next() % (FLAGS_numdistinct);
std::string key = GenerateKeyFromInt(k, FLAGS_numdistinct);
if (get_weight == 0 && put_weight == 0 && delete_weight == 0) { if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
// one batch completed, reinitialize for next batch // one batch completed, reinitialize for next batch
get_weight = FLAGS_readwritepercent; get_weight = FLAGS_readwritepercent;
delete_weight = FLAGS_deletepercent; delete_weight = FLAGS_deletepercent;
put_weight = 100 - get_weight - delete_weight; put_weight = 100 - get_weight - delete_weight;
} }
GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
FLAGS_numdistinct, &key);
if (get_weight > 0) { if (get_weight > 0) {
// do all the gets first // do all the gets first
Status s = GetMany(options, key, &value); Status s = GetMany(options, key, &value);
@ -2383,12 +2132,6 @@ class Benchmark {
// This is different from ReadWhileWriting because it does not use // This is different from ReadWhileWriting because it does not use
// an extra thread. // an extra thread.
void ReadRandomWriteRandom(ThreadState* thread) { void ReadRandomWriteRandom(ThreadState* thread) {
if (FLAGS_use_multiget){
// Separate function for multiget (for ease of reading)
ReadRandomWriteRandomMultiGet(thread);
return;
}
ReadOptions options(FLAGS_verify_checksum, true); ReadOptions options(FLAGS_verify_checksum, true);
RandomGenerator gen; RandomGenerator gen;
std::string value; std::string value;
@ -2399,28 +2142,18 @@ class Benchmark {
int64_t writes_done = 0; int64_t writes_done = 0;
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (get_weight == 0 && put_weight == 0) { if (get_weight == 0 && put_weight == 0) {
// one batch completed, reinitialize for next batch // one batch completed, reinitialize for next batch
get_weight = FLAGS_readwritepercent; get_weight = FLAGS_readwritepercent;
put_weight = 100 - get_weight; put_weight = 100 - get_weight;
} }
if (get_weight > 0) { if (get_weight > 0) {
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_get_approx) {
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
// do all the gets first // do all the gets first
Status s = db_->Get(options, key, &value); Status s = db_->Get(options, key, &value);
if (!s.ok() && !s.IsNotFound()) { if (!s.ok() && !s.IsNotFound()) {
@ -2430,14 +2163,8 @@ class Benchmark {
} else if (!s.IsNotFound()) { } else if (!s.IsNotFound()) {
found++; found++;
} }
get_weight--; get_weight--;
reads_done++; reads_done++;
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
}
} else if (put_weight > 0) { } else if (put_weight > 0) {
// then do all the corresponding number of puts // then do all the corresponding number of puts
// for all the gets we have done earlier // for all the gets we have done earlier
@ -2458,82 +2185,6 @@ class Benchmark {
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
// ReadRandomWriteRandom (with multiget)
// Does FLAGS_keys_per_multiget reads (per multiget), followed by some puts.
// FLAGS_readwritepercent will specify the ratio of gets to puts.
// e.g.: If FLAGS_keys_per_multiget == 100 and FLAGS_readwritepercent == 75
// Then each block will do 100 multigets and 33 puts
// So there are 133 operations in-total: 100 of them (75%) are gets, and 33
// of them (25%) are puts.
void ReadRandomWriteRandomMultiGet(ThreadState* thread) {
ReadOptions options(FLAGS_verify_checksum, true);
RandomGenerator gen;
// For multiget
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group
long keys_left = readwrites_; // number of keys still left to read
long num_keys; // number of keys to read in current group
long num_put_keys; // number of keys to put in current group
int64_t found = 0;
int64_t reads_done = 0;
int64_t writes_done = 0;
int64_t multigets_done = 0;
// the number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_);
while(true) {
// Read num_keys keys, then write num_put_keys keys.
// The ratio of num_keys to num_put_keys is always FLAGS_readwritepercent
// And num_keys is set to be FLAGS_keys_per_multiget (kpg)
// num_put_keys is calculated accordingly (to maintain the ratio)
// Note: On the final iteration, num_keys and num_put_keys will be smaller
num_keys = std::min(keys_left*(FLAGS_readwritepercent + 99)/100, kpg);
num_put_keys = num_keys * (100-FLAGS_readwritepercent)
/ FLAGS_readwritepercent;
// This will break the loop when duration is complete
if (duration.Done(num_keys + num_put_keys)) {
break;
}
// A quick check to make sure our formula doesn't break on edge cases
assert(num_keys >= 1);
assert(num_keys + num_put_keys <= keys_left);
// Apply the MultiGet operations
found += MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "");
++multigets_done;
reads_done+=num_keys;
thread->stats.FinishedSingleOp(db_);
// Now do the puts
int i;
int64_t k;
for(i=0; i<num_put_keys; ++i) {
k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
Status s = db_->Put(write_options_, key,
gen.Generate(value_size_));
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
writes_done++;
thread->stats.FinishedSingleOp(db_);
}
keys_left -= (num_keys + num_put_keys);
}
char msg[100];
snprintf(msg, sizeof(msg),
"( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64 \
" multiget_ops:%" PRIu64 " found:%" PRIu64 ")",
reads_done, writes_done, readwrites_, multigets_done, found);
thread->stats.AddMessage(msg);
}
// //
// Read-modify-write for random keys // Read-modify-write for random keys
void UpdateRandom(ThreadState* thread) { void UpdateRandom(ThreadState* thread) {
@ -2543,30 +2194,16 @@ class Benchmark {
int64_t found = 0; int64_t found = 0;
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_get_approx) {
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
if (db_->Get(options, key, &value).ok()) { if (db_->Get(options, key, &value).ok()) {
found++; found++;
} }
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
}
Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
@ -2589,22 +2226,12 @@ class Benchmark {
std::string value; std::string value;
int64_t found = 0; int64_t found = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// The number of iterations is the larger of read_ or write_ // The number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_get_approx) {
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
// Get the existing value // Get the existing value
if (db_->Get(options, key, &value).ok()) { if (db_->Get(options, key, &value).ok()) {
@ -2614,10 +2241,6 @@ class Benchmark {
value.clear(); value.clear();
} }
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
}
// Update the value (by appending data) // Update the value (by appending data)
Slice operand = gen.Generate(value_size_); Slice operand = gen.Generate(value_size_);
if (value.size() > 0) { if (value.size() > 0) {
@ -2634,6 +2257,7 @@ class Benchmark {
} }
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
readwrites_, found); readwrites_, found);
@ -2653,11 +2277,12 @@ class Benchmark {
void MergeRandom(ThreadState* thread) { void MergeRandom(ThreadState* thread) {
RandomGenerator gen; RandomGenerator gen;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// The number of iterations is the larger of read_ or write_ // The number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % merge_keys_; GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
std::string key = GenerateKeyFromInt(k, merge_keys_);
Status s = db_->Merge(write_options_, key, gen.Generate(value_size_)); Status s = db_->Merge(write_options_, key, gen.Generate(value_size_));
@ -2690,12 +2315,12 @@ class Benchmark {
int64_t num_merges = 0; int64_t num_merges = 0;
size_t max_length = 0; size_t max_length = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % merge_keys_; GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
std::string key = GenerateKeyFromInt(k, merge_keys_);
bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
@ -2727,6 +2352,7 @@ class Benchmark {
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), snprintf(msg, sizeof(msg),
"(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \ "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \
@ -2735,7 +2361,6 @@ class Benchmark {
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
void Compact(ThreadState* thread) { void Compact(ThreadState* thread) {
db_->CompactRange(nullptr, nullptr); db_->CompactRange(nullptr, nullptr);
} }
@ -2747,28 +2372,6 @@ class Benchmark {
} }
fprintf(stdout, "\n%s\n", stats.c_str()); fprintf(stdout, "\n%s\n", stats.c_str());
} }
static void WriteToFile(void* arg, const char* buf, int n) {
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
void HeapProfile() {
char fname[100];
EnvOptions soptions;
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db.c_str(),
++heap_counter_);
unique_ptr<WritableFile> file;
Status s = FLAGS_env->NewWritableFile(fname, &file, soptions);
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
return;
}
bool ok = port::GetHeapProfile(WriteToFile, file.get());
if (!ok) {
fprintf(stderr, "heap profiling not supported\n");
FLAGS_env->DeleteFile(fname);
}
}
}; };
} // namespace rocksdb } // namespace rocksdb

@ -7,6 +7,8 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include <stdint.h> #include <stdint.h>
@ -17,6 +19,7 @@
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "port/port.h" #include "port/port.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/sync_point.h"
namespace rocksdb { namespace rocksdb {
@ -60,21 +63,36 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
*manifest_file_size = 0; *manifest_file_size = 0;
mutex_.Lock();
if (flush_memtable) { if (flush_memtable) {
// flush all dirty data to disk. // flush all dirty data to disk.
Status status = Flush(FlushOptions()); Status status;
for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->Ref();
mutex_.Unlock();
status = FlushMemTable(cfd, FlushOptions());
mutex_.Lock();
cfd->Unref();
if (!status.ok()) {
break;
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
if (!status.ok()) { if (!status.ok()) {
mutex_.Unlock();
Log(options_.info_log, "Cannot Flush data %s\n", Log(options_.info_log, "Cannot Flush data %s\n",
status.ToString().c_str()); status.ToString().c_str());
return status; return status;
} }
} }
MutexLock l(&mutex_);
// Make a set of all of the live *.sst files // Make a set of all of the live *.sst files
std::set<uint64_t> live; std::set<uint64_t> live;
versions_->current()->AddLiveFiles(&live); for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->current()->AddLiveFiles(&live);
}
ret.clear(); ret.clear();
ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
@ -91,24 +109,60 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
// find length of manifest file while holding the mutex lock // find length of manifest file while holding the mutex lock
*manifest_file_size = versions_->ManifestFileSize(); *manifest_file_size = versions_->ManifestFileSize();
mutex_.Unlock();
return Status::OK(); return Status::OK();
} }
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
// First get sorted files in archive dir, then append sorted files from main // First get sorted files in db dir, then get sorted files from archived
// dir to maintain sorted order // dir, to avoid a race condition where a log file is moved to archived
// dir in between.
Status s;
// list wal files in main db dir.
VectorLogPtr logs;
s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
if (!s.ok()) {
return s;
}
// Reproduce the race condition where a log file is moved
// to archived dir, between these two sync points, used in
// (DBTest,TransactionLogIteratorRace)
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
files.clear();
// list wal files in archive dir. // list wal files in archive dir.
Status s;
std::string archivedir = ArchivalDirectory(options_.wal_dir); std::string archivedir = ArchivalDirectory(options_.wal_dir);
if (env_->FileExists(archivedir)) { if (env_->FileExists(archivedir)) {
s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile); s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
} }
// list wal files in main db dir.
return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile); uint64_t latest_archived_log_number = 0;
if (!files.empty()) {
latest_archived_log_number = files.back()->LogNumber();
Log(options_.info_log, "Latest Archived log: %" PRIu64,
latest_archived_log_number);
}
files.reserve(files.size() + logs.size());
for (auto& log : logs) {
if (log->LogNumber() > latest_archived_log_number) {
files.push_back(std::move(log));
} else {
// When the race condition happens, we could see the
// same log in both db dir and archived dir. Simply
// ignore the one in db dir. Note that, if we read
// archived dir first, we would have missed the log file.
Log(options_.info_log, "%s already moved to archive",
log->PathName().c_str());
}
}
return s;
} }
} }

File diff suppressed because it is too large Load Diff

@ -13,10 +13,12 @@
#include <set> #include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <string>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/log_writer.h" #include "db/log_writer.h"
#include "db/snapshot.h" #include "db/snapshot.h"
#include "db/column_family.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "memtable_list.h" #include "memtable_list.h"
#include "port/port.h" #include "port/port.h"
@ -40,44 +42,79 @@ class CompactionFilterV2;
class DBImpl : public DB { class DBImpl : public DB {
public: public:
DBImpl(const Options& options, const std::string& dbname); DBImpl(const DBOptions& options, const std::string& dbname);
virtual ~DBImpl(); virtual ~DBImpl();
// Implementations of the DB interface // Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); using DB::Put;
virtual Status Merge(const WriteOptions&, const Slice& key, virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
using DB::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value); const Slice& value);
virtual Status Delete(const WriteOptions&, const Slice& key); using DB::Delete;
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key);
using DB::Write;
virtual Status Write(const WriteOptions& options, WriteBatch* updates); virtual Status Write(const WriteOptions& options, WriteBatch* updates);
using DB::Get;
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value); std::string* value);
virtual std::vector<Status> MultiGet(const ReadOptions& options, using DB::MultiGet;
const std::vector<Slice>& keys, virtual std::vector<Status> MultiGet(
std::vector<std::string>* values); const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values);
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family,
ColumnFamilyHandle** handle);
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
// Returns false if key doesn't exist in the database and true if it may. // Returns false if key doesn't exist in the database and true if it may.
// If value_found is not passed in as null, then return the value if found in // If value_found is not passed in as null, then return the value if found in
// memory. On return, if value was found, then value_found will be set to true // memory. On return, if value was found, then value_found will be set to true
// , otherwise false. // , otherwise false.
using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value, bool* value_found = nullptr);
bool* value_found = nullptr); using DB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions&); virtual Iterator* NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family);
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators);
virtual const Snapshot* GetSnapshot(); virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot); virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value); using DB::GetProperty;
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); virtual bool GetProperty(ColumnFamilyHandle* column_family,
virtual Status CompactRange(const Slice* begin, const Slice* end, const Slice& property, std::string* value);
using DB::GetApproximateSizes;
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes);
using DB::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1); bool reduce_level = false, int target_level = -1);
virtual int NumberLevels();
virtual int MaxMemCompactionLevel(); using DB::NumberLevels;
virtual int Level0StopWriteTrigger(); virtual int NumberLevels(ColumnFamilyHandle* column_family);
using DB::MaxMemCompactionLevel;
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
using DB::Level0StopWriteTrigger;
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
virtual const std::string& GetName() const; virtual const std::string& GetName() const;
virtual Env* GetEnv() const; virtual Env* GetEnv() const;
virtual const Options& GetOptions() const; using DB::GetOptions;
virtual Status Flush(const FlushOptions& options); virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
using DB::Flush;
virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family);
virtual Status DisableFileDeletions(); virtual Status DisableFileDeletions();
virtual Status EnableFileDeletions(bool force); virtual Status EnableFileDeletions(bool force);
// All the returned filenames start with "/" // All the returned filenames start with "/"
@ -92,8 +129,7 @@ class DBImpl : public DB {
read_options = TransactionLogIterator::ReadOptions()); read_options = TransactionLogIterator::ReadOptions());
virtual Status DeleteFile(std::string name); virtual Status DeleteFile(std::string name);
virtual void GetLiveFilesMetaData( virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
std::vector<LiveFileMetaData> *metadata);
// checks if all live files exist on file system and that their file sizes // checks if all live files exist on file system and that their file sizes
// match to our in-memory records // match to our in-memory records
@ -101,23 +137,21 @@ class DBImpl : public DB {
virtual Status GetDbIdentity(std::string& identity); virtual Status GetDbIdentity(std::string& identity);
Status RunManualCompaction(int input_level, Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
int output_level, int output_level, const Slice* begin,
const Slice* begin,
const Slice* end); const Slice* end);
// Extra methods (for testing) that are not in the public DB interface // Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [*begin, *end] // Compact any files in the named level that overlap [*begin, *end]
Status TEST_CompactRange(int level, Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
const Slice* begin, ColumnFamilyHandle* column_family = nullptr);
const Slice* end);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status TEST_FlushMemTable(bool wait = true); Status TEST_FlushMemTable(bool wait = true);
// Wait for memtable compaction // Wait for memtable compaction
Status TEST_WaitForFlushMemTable(); Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
// Wait for any compaction // Wait for any compaction
Status TEST_WaitForCompact(); Status TEST_WaitForCompact();
@ -125,14 +159,13 @@ class DBImpl : public DB {
// Return an internal iterator over the current state of the database. // Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h). // The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed. // The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator(); Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
nullptr);
// Return the maximum overlapping data (in bytes) at next level for any // Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1. // file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes(); int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
nullptr);
// Simulate a db crash, no elegant closing of database.
void TEST_Destroy_DBImpl();
// Return the current manifest file no. // Return the current manifest file no.
uint64_t TEST_Current_Manifest_FileNo(); uint64_t TEST_Current_Manifest_FileNo();
@ -148,61 +181,8 @@ class DBImpl : public DB {
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
} }
void TEST_GetFilesMetaData(std::vector<std::vector<FileMetaData>>* metadata); void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
std::vector<std::vector<FileMetaData>>* metadata);
// holds references to memtable, all immutable memtables and version
struct SuperVersion {
MemTable* mem;
MemTableListVersion* imm;
Version* current;
std::atomic<uint32_t> refs;
// We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction
autovector<MemTable*> to_delete;
// Version number of the current SuperVersion
uint64_t version_number;
DBImpl* db;
// should be called outside the mutex
SuperVersion() = default;
~SuperVersion();
SuperVersion* Ref();
// Returns true if this was the last reference and caller should
// call Clenaup() and delete the object
bool Unref();
// call these two methods with db mutex held
// Cleanup unrefs mem, imm and current. Also, it stores all memtables
// that needs to be deleted in to_delete vector. Unrefing those
// objects needs to be done in the mutex
void Cleanup();
void Init(MemTable* new_mem, MemTableListVersion* new_imm,
Version* new_current);
// The value of dummy is not actually used. kSVInUse takes its address as a
// mark in the thread local storage to indicate the SuperVersion is in use
// by thread. This way, the value of kSVInUse is guaranteed to have no
// conflict with SuperVersion object address and portable on different
// platform.
static int dummy;
static void* const kSVInUse;
static void* const kSVObsolete;
};
static void SuperVersionUnrefHandle(void* ptr) {
// UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
// destroyed. When former happens, the thread shouldn't see kSVInUse.
// When latter happens, we are in ~DBImpl(), no get should happen as well.
assert(ptr != SuperVersion::kSVInUse);
DBImpl::SuperVersion* sv = static_cast<DBImpl::SuperVersion*>(ptr);
if (sv->Unref()) {
sv->db->mutex_.Lock();
sv->Cleanup();
sv->db->mutex_.Unlock();
delete sv;
}
}
// needed for CleanupIteratorState // needed for CleanupIteratorState
struct DeletionState { struct DeletionState {
@ -231,7 +211,7 @@ class DBImpl : public DB {
autovector<SuperVersion*> superversions_to_free; autovector<SuperVersion*> superversions_to_free;
SuperVersion* new_superversion; // if nullptr no new superversion SuperVersion* new_superversion; // if nullptr no new superversion
// the current manifest_file_number, log_number and prev_log_number // the current manifest_file_number, log_number and prev_log_number
// that corresponds to the set of files in 'live'. // that corresponds to the set of files in 'live'.
@ -243,8 +223,7 @@ class DBImpl : public DB {
pending_manifest_file_number = 0; pending_manifest_file_number = 0;
log_number = 0; log_number = 0;
prev_log_number = 0; prev_log_number = 0;
new_superversion = new_superversion = create_superversion ? new SuperVersion() : nullptr;
create_superversion ? new SuperVersion() : nullptr;
} }
~DeletionState() { ~DeletionState() {
@ -277,23 +256,16 @@ class DBImpl : public DB {
// It is not necessary to hold the mutex when invoking this method. // It is not necessary to hold the mutex when invoking this method.
void PurgeObsoleteFiles(DeletionState& deletion_state); void PurgeObsoleteFiles(DeletionState& deletion_state);
ColumnFamilyHandle* DefaultColumnFamily() const;
protected: protected:
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
unique_ptr<VersionSet> versions_; unique_ptr<VersionSet> versions_;
const InternalKeyComparator internal_comparator_; const DBOptions options_;
const Options options_; // options_.comparator == &internal_comparator_
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
SuperVersion* GetSuperVersion() {
return super_version_;
}
Iterator* NewInternalIterator(const ReadOptions&, Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
SequenceNumber* latest_snapshot); SuperVersion* super_version);
private: private:
friend class DB; friend class DB;
@ -306,8 +278,10 @@ class DBImpl : public DB {
Status NewDB(); Status NewDB();
// Recover the descriptor from persistent storage. May do a significant // Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. // amount of work to recover recently logged updates. Any changes to
Status Recover(bool read_only = false, bool error_if_log_file_exist = false); // be made to the descriptor are added to *edit.
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only = false, bool error_if_log_file_exist = false);
void MaybeIgnoreError(Status* s) const; void MaybeIgnoreError(Status* s) const;
@ -318,7 +292,7 @@ class DBImpl : public DB {
// Flush the in-memory write buffer to storage. Switches to a new // Flush the in-memory write buffer to storage. Switches to a new
// log-file/memtable and writes a new descriptor iff successful. // log-file/memtable and writes a new descriptor iff successful.
Status FlushMemTableToOutputFile(bool* madeProgress, Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
DeletionState& deletion_state, DeletionState& deletion_state,
LogBuffer* log_buffer); LogBuffer* log_buffer);
@ -330,25 +304,26 @@ class DBImpl : public DB {
// database is opened) and is heavyweight because it holds the mutex // database is opened) and is heavyweight because it holds the mutex
// for the entire period. The second method WriteLevel0Table supports // for the entire period. The second method WriteLevel0Table supports
// concurrent flush memtables to storage. // concurrent flush memtables to storage.
Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit); Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
Status WriteLevel0Table(autovector<MemTable*>& mems, VersionEdit* edit, VersionEdit* edit);
uint64_t* filenumber, Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
VersionEdit* edit, uint64_t* filenumber,
LogBuffer* log_buffer); LogBuffer* log_buffer);
uint64_t SlowdownAmount(int n, double bottom, double top); uint64_t SlowdownAmount(int n, double bottom, double top);
// MakeRoomForWrite will return superversion_to_free through an arugment,
// which the caller needs to delete. We do it because caller can delete // TODO(icanadi) free superversion_to_free and old_log outside of mutex
// the superversion outside of mutex Status MakeRoomForWrite(ColumnFamilyData* cfd,
Status MakeRoomForWrite(bool force /* compact even if there is room? */, bool force /* flush even if there is room? */);
SuperVersion** superversion_to_free);
void BuildBatchGroup(Writer** last_writer, void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group); autovector<WriteBatch*>* write_batch_group);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status FlushMemTable(const FlushOptions& options); Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
// Wait for memtable flushed // Wait for memtable flushed
Status WaitForFlushMemTable(); Status WaitForFlushMemTable(ColumnFamilyData* cfd);
void MaybeScheduleLogDBDeployStats(); void MaybeScheduleLogDBDeployStats();
static void BGLogDBDeployStats(void* db); static void BGLogDBDeployStats(void* db);
@ -368,6 +343,13 @@ class DBImpl : public DB {
DeletionState& deletion_state, DeletionState& deletion_state,
LogBuffer* log_buffer); LogBuffer* log_buffer);
// This function is called as part of compaction. It enables Flush process to
// preempt compaction, since it's higher prioirty
// Returns: micros spent executing
uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
DeletionState& deletion_state,
LogBuffer* log_buffer);
// Call compaction filter if is_compaction_v2 is not true. Then iterate // Call compaction filter if is_compaction_v2 is not true. Then iterate
// through input and compact the kv-pairs // through input and compact the kv-pairs
Status ProcessKeyValueCompaction( Status ProcessKeyValueCompaction(
@ -388,15 +370,16 @@ class DBImpl : public DB {
Status OpenCompactionOutputFile(CompactionState* compact); Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact); Status InstallCompactionResults(CompactionState* compact,
LogBuffer* log_buffer);
void AllocateCompactionOutputFileNumbers(CompactionState* compact); void AllocateCompactionOutputFileNumbers(CompactionState* compact);
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
void PurgeObsoleteWALFiles(); void PurgeObsoleteWALFiles();
Status AppendSortedWalsOfType(const std::string& path, Status GetSortedWalsOfType(const std::string& path,
VectorLogPtr& log_files, VectorLogPtr& log_files,
WalFileType type); WalFileType type);
// Requires: all_logs should be sorted with earliest log file first // Requires: all_logs should be sorted with earliest log file first
// Retains all log files in all_logs which contain updates with seq no. // Retains all log files in all_logs which contain updates with seq no.
@ -419,30 +402,23 @@ class DBImpl : public DB {
// Return the minimum empty level that could hold the total data in the // Return the minimum empty level that could hold the total data in the
// input level. Return the input level, if such level could not be found. // input level. Return the input level, if such level could not be found.
int FindMinimumEmptyLevelFitting(int level); int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
// Move the files in the input level to the target level. // Move the files in the input level to the target level.
// If target_level < 0, automatically calculate the minimum level that could // If target_level < 0, automatically calculate the minimum level that could
// hold the data set. // hold the data set.
Status ReFitLevel(int level, int target_level = -1); Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
// Returns the current SuperVersion number.
uint64_t CurrentVersionNumber() const;
// Returns a pair of iterators (mutable-only and immutable-only) used // Returns a pair of iterators (mutable-only and immutable-only) used
// internally by TailingIterator and stores CurrentVersionNumber() in // internally by TailingIterator and stores cfd->GetSuperVersionNumber() in
// *superversion_number. These iterators are always up-to-date, i.e. can // *superversion_number. These iterators are always up-to-date, i.e. can
// be used to read new data. // be used to read new data.
std::pair<Iterator*, Iterator*> GetTailingIteratorPair( std::pair<Iterator*, Iterator*> GetTailingIteratorPair(
const ReadOptions& options, const ReadOptions& options, ColumnFamilyData* cfd,
uint64_t* superversion_number); uint64_t* superversion_number);
// Constant after construction
const InternalFilterPolicy internal_filter_policy_;
bool owns_info_log_;
// table_cache_ provides its own synchronization // table_cache_ provides its own synchronization
unique_ptr<TableCache> table_cache_; std::shared_ptr<Cache> table_cache_;
// Lock over the persistent DB state. Non-nullptr iff successfully acquired. // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
FileLock* db_lock_; FileLock* db_lock_;
@ -451,20 +427,11 @@ class DBImpl : public DB {
port::Mutex mutex_; port::Mutex mutex_;
port::AtomicPointer shutting_down_; port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when background work finishes port::CondVar bg_cv_; // Signalled when background work finishes
MemTable* mem_;
MemTableList imm_; // Memtable that are not changing
uint64_t logfile_number_; uint64_t logfile_number_;
unique_ptr<log::Writer> log_; unique_ptr<log::Writer> log_;
ColumnFamilyHandleImpl* default_cf_handle_;
SuperVersion* super_version_; unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
std::deque<uint64_t> alive_log_files_;
// An ordinal representing the current SuperVersion. Updated by
// InstallSuperVersion(), i.e. incremented every time super_version_
// changes.
std::atomic<uint64_t> super_version_number_;
// Thread's local copy of SuperVersion pointer
// This needs to be destructed after mutex_
ThreadLocalPtr* local_sv_;
std::string host_name_; std::string host_name_;
@ -500,6 +467,7 @@ class DBImpl : public DB {
// Information for a manual compaction // Information for a manual compaction
struct ManualCompaction { struct ManualCompaction {
ColumnFamilyData* cfd;
int input_level; int input_level;
int output_level; int output_level;
bool done; bool done;
@ -541,8 +509,6 @@ class DBImpl : public DB {
bool flush_on_destroy_; // Used when disableWAL is true. bool flush_on_destroy_; // Used when disableWAL is true.
InternalStats internal_stats_;
static const int KEEP_LOG_FILE_NUM = 1000; static const int KEEP_LOG_FILE_NUM = 1000;
std::string db_absolute_path_; std::string db_absolute_path_;
@ -575,28 +541,21 @@ class DBImpl : public DB {
std::vector<SequenceNumber>& snapshots, std::vector<SequenceNumber>& snapshots,
SequenceNumber* prev_snapshot); SequenceNumber* prev_snapshot);
// will return a pointer to SuperVersion* if previous SuperVersion
// if its reference count is zero and needs deletion or nullptr if not
// As argument takes a pointer to allocated SuperVersion
// Foreground threads call this function directly (they don't carry
// deletion state and have to handle their own creation and deletion
// of SuperVersion)
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
// Background threads call this function, which is just a wrapper around // Background threads call this function, which is just a wrapper around
// the InstallSuperVersion() function above. Background threads carry // the cfd->InstallSuperVersion() function. Background threads carry
// deletion_state which can have new_superversion already allocated. // deletion_state which can have new_superversion already allocated.
void InstallSuperVersion(DeletionState& deletion_state); void InstallSuperVersion(ColumnFamilyData* cfd,
DeletionState& deletion_state);
void ResetThreadLocalSuperVersions(DeletionState* deletion_state); using DB::GetPropertiesOfAllTables;
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) TablePropertiesCollection* props)
override; override;
// Function that Get and KeyMayExist call with no_io true or false // Function that Get and KeyMayExist call with no_io true or false
// Note: 'value_found' from KeyMayExist propagates here // Note: 'value_found' from KeyMayExist propagates here
Status GetImpl(const ReadOptions& options, Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, const Slice& key, std::string* value,
std::string* value,
bool* value_found = nullptr); bool* value_found = nullptr);
}; };
@ -606,7 +565,7 @@ extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp, const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy, const InternalFilterPolicy* ipolicy,
const Options& src); const Options& src);
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
// Determine compression type, based on user options, level of the output // Determine compression type, based on user options, level of the output
// file and whether compression is disabled. // file and whether compression is disabled.

@ -42,8 +42,8 @@
namespace rocksdb { namespace rocksdb {
DBImplReadOnly::DBImplReadOnly(const Options& options, DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
const std::string& dbname) const std::string& dbname)
: DBImpl(options, dbname) { : DBImpl(options, dbname) {
Log(options_.info_log, "Opening the db in read only mode"); Log(options_.info_log, "Opening the db in read only mode");
} }
@ -53,42 +53,57 @@ DBImplReadOnly::~DBImplReadOnly() {
// Implementations of the DB interface // Implementations of the DB interface
Status DBImplReadOnly::Get(const ReadOptions& options, Status DBImplReadOnly::Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) { std::string* value) {
Status s; Status s;
SequenceNumber snapshot = versions_->LastSequence(); SequenceNumber snapshot = versions_->LastSequence();
SuperVersion* super_version = GetSuperVersion(); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion();
MergeContext merge_context; MergeContext merge_context;
LookupKey lkey(key, snapshot); LookupKey lkey(key, snapshot);
if (super_version->mem->Get(lkey, value, &s, merge_context, options_)) { if (super_version->mem->Get(lkey, value, &s, merge_context,
*cfd->options())) {
} else { } else {
Version::GetStats stats; Version::GetStats stats;
super_version->current->Get(options, lkey, value, &s, &merge_context, super_version->current->Get(options, lkey, value, &s, &merge_context,
&stats, options_); &stats, *cfd->options());
} }
return s; return s;
} }
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) { Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
SequenceNumber latest_snapshot; ColumnFamilyHandle* column_family) {
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
SequenceNumber latest_snapshot = versions_->LastSequence();
Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
return NewDBIterator( return NewDBIterator(
&dbname_, env_, options_, user_comparator(),internal_iter, &dbname_, env_, *cfd->options(), cfd->user_comparator(), internal_iter,
(options.snapshot != nullptr (options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot)); : latest_snapshot));
} }
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DB** dbptr, bool error_if_log_file_exist) { DB** dbptr, bool error_if_log_file_exist) {
*dbptr = nullptr; *dbptr = nullptr;
DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); DBOptions db_options(options);
ColumnFamilyOptions cf_options(options);
std::vector<ColumnFamilyDescriptor> column_families;
column_families.push_back(
ColumnFamilyDescriptor(default_column_family_name, cf_options));
DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
impl->mutex_.Lock(); impl->mutex_.Lock();
Status s = impl->Recover(true /* read only */, error_if_log_file_exist); Status s = impl->Recover(column_families, true /* read only */,
error_if_log_file_exist);
if (s.ok()) { if (s.ok()) {
delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
}
} }
impl->mutex_.Unlock(); impl->mutex_.Unlock();
if (s.ok()) { if (s.ok()) {

@ -12,6 +12,8 @@
#include <deque> #include <deque>
#include <set> #include <set>
#include <vector>
#include <string>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/log_writer.h" #include "db/log_writer.h"
#include "db/snapshot.h" #include "db/snapshot.h"
@ -23,57 +25,79 @@
namespace rocksdb { namespace rocksdb {
class DBImplReadOnly : public DBImpl { class DBImplReadOnly : public DBImpl {
public: public:
DBImplReadOnly(const Options& options, const std::string& dbname); DBImplReadOnly(const DBOptions& options, const std::string& dbname);
virtual ~DBImplReadOnly(); virtual ~DBImplReadOnly();
// Implementations of the DB interface // Implementations of the DB interface
virtual Status Get(const ReadOptions& options, using DB::Get;
const Slice& key, virtual Status Get(const ReadOptions& options,
std::string* value); ColumnFamilyHandle* column_family, const Slice& key,
std::string* value);
// TODO: Implement ReadOnly MultiGet? // TODO: Implement ReadOnly MultiGet?
virtual Iterator* NewIterator(const ReadOptions&); using DBImpl::NewIterator;
virtual Iterator* NewIterator(const ReadOptions&,
ColumnFamilyHandle* column_family);
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) { virtual Status NewIterators(
return Status::NotSupported("Not supported operation in read only mode."); const ReadOptions& options,
} const std::vector<ColumnFamilyHandle*>& column_family,
virtual Status Merge(const WriteOptions&, const Slice& key, std::vector<Iterator*>* iterators) {
const Slice& value) { // TODO
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported yet.");
} }
virtual Status Delete(const WriteOptions&, const Slice& key) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status DisableFileDeletions() {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status EnableFileDeletions(bool force) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Flush(const FlushOptions& options) {
return Status::NotSupported("Not supported operation in read only mode.");
}
private: using DBImpl::Put;
friend class DB; virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Delete;
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false,
int target_level = -1) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status DisableFileDeletions() {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status EnableFileDeletions(bool force) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Flush;
virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) {
return Status::NotSupported("Not supported operation in read only mode.");
}
// No copying allowed private:
DBImplReadOnly(const DBImplReadOnly&); friend class DB;
void operator=(const DBImplReadOnly&);
};
// No copying allowed
DBImplReadOnly(const DBImplReadOnly&);
void operator=(const DBImplReadOnly&);
};
} }

@ -39,71 +39,6 @@ static void DumpInternalIter(Iterator* iter) {
namespace { namespace {
// Key buffer used by DBIter. Short keys live in a small inline array;
// keys longer than the inline capacity move to a heap allocation that is
// grown on demand and reclaimed by Clear()/the destructor.
class IterLookupKey {
 public:
  IterLookupKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}

  ~IterLookupKey() { Clear(); }

  // Returns the stored key, or an empty slice when no buffer is present.
  Slice GetKey() const {
    if (key_ != nullptr) {
      return Slice(key_, key_size_);
    } else {
      return Slice();
    }
  }

  bool Valid() const { return key_ != nullptr; }

  // Frees any heap-allocated buffer and reverts to the inline buffer.
  void Clear() {
    if (key_ != nullptr && key_ != space_) {
      delete[] key_;
    }
    key_ = space_;
    // BUGFIX: record the inline buffer's full capacity. The original code
    // assigned sizeof(buf_size_) (i.e. sizeof(size_t)), which understated
    // the capacity of space_ and caused needless heap allocations for any
    // key longer than 8 bytes after a Clear().
    buf_size_ = sizeof(space_);
  }

  // Enlarge the buffer size if needed based on key_size.
  // By default, static allocated buffer is used. Once there is a key
  // larger than the static allocated buffer, another buffer is dynamically
  // allocated, until a larger key buffer is requested. In that case, we
  // reallocate buffer and delete the old one.
  void EnlargeBufferIfNeeded(size_t key_size) {
    // If size is smaller than buffer size, continue using current buffer,
    // or the static allocated one, as default
    if (key_size > buf_size_) {
      // Need to enlarge the buffer.
      Clear();
      key_ = new char[key_size];
      buf_size_ = key_size;
    }
    key_size_ = key_size;
  }

  // Copies a user key into the buffer.
  void SetUserKey(const Slice& user_key) {
    size_t size = user_key.size();
    EnlargeBufferIfNeeded(size);
    memcpy(key_, user_key.data(), size);
  }

  // Builds an internal key: user key followed by the packed
  // (sequence, kValueTypeForSeek) trailer.
  void SetInternalKey(const Slice& user_key, SequenceNumber s) {
    size_t usize = user_key.size();
    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
    memcpy(key_, user_key.data(), usize);
    EncodeFixed64(key_ + usize, PackSequenceAndType(s, kValueTypeForSeek));
  }

 private:
  char* key_;        // Points at space_ or at a heap allocation.
  size_t buf_size_;  // Capacity of the buffer key_ points at.
  size_t key_size_;  // Length of the key currently stored.
  char space_[32];   // Avoid allocation for short keys

  // No copying allowed
  IterLookupKey(const IterLookupKey&) = delete;
  // BUGFIX: the deleted assignment previously took `const LookupKey&` (a
  // different class), so IterLookupKey's own copy assignment was still
  // implicitly defaulted and the class remained assignable despite the
  // "no copying" intent.
  void operator=(const IterLookupKey&) = delete;
};
// Memtables and sstables that make the DB representation contain // Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter // (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB // combines multiple entries for the same userkey found in the DB
@ -191,7 +126,7 @@ class DBIter: public Iterator {
SequenceNumber const sequence_; SequenceNumber const sequence_;
Status status_; Status status_;
IterLookupKey saved_key_; // == current key when direction_==kReverse IterKey saved_key_; // == current key when direction_==kReverse
std::string saved_value_; // == current raw value when direction_==kReverse std::string saved_value_; // == current raw value when direction_==kReverse
std::string skip_key_; std::string skip_key_;
Direction direction_; Direction direction_;
@ -254,10 +189,9 @@ void DBIter::Next() {
// NOTE: In between, saved_key_ can point to a user key that has // NOTE: In between, saved_key_ can point to a user key that has
// a delete marker // a delete marker
inline void DBIter::FindNextUserEntry(bool skipping) { inline void DBIter::FindNextUserEntry(bool skipping) {
StopWatchNano timer(env_, false); PERF_TIMER_AUTO(find_next_user_entry_time);
StartPerfTimer(&timer);
FindNextUserEntryInternal(skipping); FindNextUserEntryInternal(skipping);
BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); PERF_TIMER_STOP(find_next_user_entry_time);
} }
// Actual implementation of DBIter::FindNextUserEntry() // Actual implementation of DBIter::FindNextUserEntry()
@ -273,7 +207,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
if (skipping && if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry num_skipped++; // skip this entry
BumpPerfCount(&perf_context.internal_key_skipped_count); PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else { } else {
skipping = false; skipping = false;
switch (ikey.type) { switch (ikey.type) {
@ -283,7 +217,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
saved_key_.SetUserKey(ikey.user_key); saved_key_.SetUserKey(ikey.user_key);
skipping = true; skipping = true;
num_skipped = 0; num_skipped = 0;
BumpPerfCount(&perf_context.internal_delete_skipped_count); PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break; break;
case kTypeValue: case kTypeValue:
valid_ = true; valid_ = true;
@ -488,10 +422,9 @@ void DBIter::Seek(const Slice& target) {
saved_key_.Clear(); saved_key_.Clear();
// now savved_key is used to store internal key. // now savved_key is used to store internal key.
saved_key_.SetInternalKey(target, sequence_); saved_key_.SetInternalKey(target, sequence_);
StopWatchNano internal_seek_timer(env_, false); PERF_TIMER_AUTO(seek_internal_seek_time);
StartPerfTimer(&internal_seek_timer);
iter_->Seek(saved_key_.GetKey()); iter_->Seek(saved_key_.GetKey());
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); PERF_TIMER_STOP(seek_internal_seek_time);
if (iter_->Valid()) { if (iter_->Valid()) {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
@ -504,10 +437,9 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() { void DBIter::SeekToFirst() {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false); PERF_TIMER_AUTO(seek_internal_seek_time);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToFirst(); iter_->SeekToFirst();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); PERF_TIMER_STOP(seek_internal_seek_time);
if (iter_->Valid()) { if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */); FindNextUserEntry(false /* not skipping */);
} else { } else {
@ -526,10 +458,9 @@ void DBIter::SeekToLast() {
direction_ = kReverse; direction_ = kReverse;
ClearSavedValue(); ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false); PERF_TIMER_AUTO(seek_internal_seek_time);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToLast(); iter_->SeekToLast();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); PERF_TIMER_STOP(seek_internal_seek_time);
FindPrevUserEntry(); FindPrevUserEntry();
} }

@ -65,7 +65,7 @@ void DBImpl::LogDBDeployStats() {
uint64_t file_total_size = 0; uint64_t file_total_size = 0;
uint32_t file_total_num = 0; uint32_t file_total_num = 0;
Version* current = versions_->current(); Version* current = default_cf_handle_->cfd()->current();
for (int i = 0; i < current->NumberLevels(); i++) { for (int i = 0; i < current->NumberLevels(); i++) {
file_total_num += current->NumLevelFiles(i); file_total_num += current->NumLevelFiles(i);
file_total_size += current->NumLevelBytes(i); file_total_size += current->NumLevelBytes(i);

File diff suppressed because it is too large Load Diff

@ -59,7 +59,7 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// decreasing sequence number // decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate) // decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
BumpPerfCount(&perf_context.user_key_comparison_count); PERF_COUNTER_ADD(user_key_comparison_count, 1);
if (r == 0) { if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
@ -79,7 +79,7 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a,
// decreasing sequence number // decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate) // decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(a.user_key, b.user_key); int r = user_comparator_->Compare(a.user_key, b.user_key);
BumpPerfCount(&perf_context.user_key_comparison_count); PERF_COUNTER_ADD(user_key_comparison_count, 1);
if (r == 0) { if (r == 0) {
if (a.sequence > b.sequence) { if (a.sequence > b.sequence) {
r = -1; r = -1;

@ -32,6 +32,9 @@ enum ValueType : unsigned char {
kTypeValue = 0x1, kTypeValue = 0x1,
kTypeMerge = 0x2, kTypeMerge = 0x2,
kTypeLogData = 0x3, kTypeLogData = 0x3,
kTypeColumnFamilyDeletion = 0x4,
kTypeColumnFamilyValue = 0x5,
kTypeColumnFamilyMerge = 0x6,
kMaxValue = 0x7F kMaxValue = 0x7F
}; };
@ -235,4 +238,74 @@ inline LookupKey::~LookupKey() {
if (start_ != space_) delete[] start_; if (start_ != space_) delete[] start_;
} }
// Reusable key buffer. Short keys live in a small inline array; keys longer
// than the inline capacity move to a heap allocation that is grown on demand
// and reclaimed by Clear()/the destructor.
class IterKey {
 public:
  IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}

  ~IterKey() { Clear(); }

  // Returns the stored key, or an empty slice when no buffer is present.
  Slice GetKey() const {
    if (key_ != nullptr) {
      return Slice(key_, key_size_);
    } else {
      return Slice();
    }
  }

  bool Valid() const { return key_ != nullptr; }

  // Frees any heap-allocated buffer and reverts to the inline buffer.
  void Clear() {
    if (key_ != nullptr && key_ != space_) {
      delete[] key_;
    }
    key_ = space_;
    // BUGFIX: record the inline buffer's full capacity. The original code
    // assigned sizeof(buf_size_) (i.e. sizeof(size_t)), which understated
    // the capacity of space_ and caused needless heap allocations for any
    // key longer than 8 bytes after a Clear().
    buf_size_ = sizeof(space_);
  }

  // Enlarge the buffer size if needed based on key_size.
  // By default, static allocated buffer is used. Once there is a key
  // larger than the static allocated buffer, another buffer is dynamically
  // allocated, until a larger key buffer is requested. In that case, we
  // reallocate buffer and delete the old one.
  void EnlargeBufferIfNeeded(size_t key_size) {
    // If size is smaller than buffer size, continue using current buffer,
    // or the static allocated one, as default
    if (key_size > buf_size_) {
      // Need to enlarge the buffer.
      Clear();
      key_ = new char[key_size];
      buf_size_ = key_size;
    }
    key_size_ = key_size;
  }

  // Copies a user key into the buffer.
  void SetUserKey(const Slice& user_key) {
    size_t size = user_key.size();
    EnlargeBufferIfNeeded(size);
    memcpy(key_, user_key.data(), size);
  }

  // Builds an internal key: user key followed by the packed
  // (sequence, value_type) trailer.
  void SetInternalKey(const Slice& user_key, SequenceNumber s,
                      ValueType value_type = kValueTypeForSeek) {
    size_t usize = user_key.size();
    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
    memcpy(key_, user_key.data(), usize);
    EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
  }

  // Convenience overload: rebuilds the internal key from its parsed form.
  void SetInternalKey(const ParsedInternalKey& parsed_key) {
    SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
  }

 private:
  char* key_;        // Points at space_ or at a heap allocation.
  size_t buf_size_;  // Capacity of the buffer key_ points at.
  size_t key_size_;  // Length of the key currently stored.
  char space_[32];   // Avoid allocation for short keys

  // No copying allowed
  IterKey(const IterKey&) = delete;
  void operator=(const IterKey&) = delete;
};
} // namespace rocksdb } // namespace rocksdb

@ -7,8 +7,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/internal_stats.h" #include "db/internal_stats.h"
#include "db/db_impl.h" #include "db/column_family.h"
#include "db/memtable_list.h"
#include <vector> #include <vector>
@ -44,10 +43,8 @@ DBPropertyType GetPropertyType(const Slice& property) {
bool InternalStats::GetProperty(DBPropertyType property_type, bool InternalStats::GetProperty(DBPropertyType property_type,
const Slice& property, std::string* value, const Slice& property, std::string* value,
DBImpl* db) { ColumnFamilyData* cfd) {
VersionSet* version_set = db->versions_.get(); Version* current = cfd->current();
Version* current = version_set->current();
const MemTableList& imm = db->imm_;
Slice in = property; Slice in = property;
switch (property_type) { switch (property_type) {
@ -110,7 +107,6 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL); write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
} }
// Pardon the long line but I think it is easier to read this way.
snprintf( snprintf(
buf, sizeof(buf), buf, sizeof(buf),
" Compactions\n" " Compactions\n"
@ -159,7 +155,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
"%9lu\n", "%9lu\n",
level, files, current->NumLevelBytes(level) / 1048576.0, level, files, current->NumLevelBytes(level) / 1048576.0,
current->NumLevelBytes(level) / current->NumLevelBytes(level) /
version_set->MaxBytesForLevel(level), cfd->compaction_picker()->MaxBytesForLevel(level),
compaction_stats_[level].micros / 1e6, compaction_stats_[level].micros / 1e6,
bytes_read / 1048576.0, bytes_read / 1048576.0,
compaction_stats_[level].bytes_written / 1048576.0, compaction_stats_[level].bytes_written / 1048576.0,
@ -334,11 +330,11 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
*value = current->DebugString(); *value = current->DebugString();
return true; return true;
case kNumImmutableMemTable: case kNumImmutableMemTable:
*value = std::to_string(imm.size()); *value = std::to_string(cfd->imm()->size());
return true; return true;
case kMemtableFlushPending: case kMemtableFlushPending:
// Return number of mem tables that are ready to flush (made immutable) // Return number of mem tables that are ready to flush (made immutable)
*value = std::to_string(imm.IsFlushPending() ? 1 : 0); *value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
return true; return true;
case kCompactionPending: case kCompactionPending:
// 1 if the system already determines at least one compacdtion is needed. // 1 if the system already determines at least one compacdtion is needed.
@ -351,7 +347,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
return true; return true;
case kCurSizeActiveMemTable: case kCurSizeActiveMemTable:
// Current size of the active memtable // Current size of the active memtable
*value = std::to_string(db->mem_->ApproximateMemoryUsage()); *value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
return true; return true;
default: default:
return false; return false;

@ -16,6 +16,8 @@
#include <vector> #include <vector>
#include <string> #include <string>
class ColumnFamilyData;
namespace rocksdb { namespace rocksdb {
class MemTableList; class MemTableList;
@ -126,7 +128,7 @@ class InternalStats {
uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
bool GetProperty(DBPropertyType property_type, const Slice& property, bool GetProperty(DBPropertyType property_type, const Slice& property,
std::string* value, DBImpl* db); std::string* value, ColumnFamilyData* cfd);
private: private:
std::vector<CompactionStats> compaction_stats_; std::vector<CompactionStats> compaction_stats_;

@ -29,7 +29,8 @@
namespace rocksdb { namespace rocksdb {
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) MemTable::MemTable(const InternalKeyComparator& cmp,
const Options& options)
: comparator_(cmp), : comparator_(cmp),
refs_(0), refs_(0),
kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
@ -42,7 +43,6 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
file_number_(0), file_number_(0),
first_seqno_(0), first_seqno_(0),
mem_next_logfile_number_(0), mem_next_logfile_number_(0),
mem_logfile_number_(0),
locks_(options.inplace_update_support ? options.inplace_update_num_locks locks_(options.inplace_update_support ? options.inplace_update_num_locks
: 0), : 0),
prefix_extractor_(options.prefix_extractor.get()), prefix_extractor_(options.prefix_extractor.get()),
@ -142,6 +142,11 @@ Slice MemTableRep::UserKey(const char* key) const {
return Slice(slice.data(), slice.size() - 8); return Slice(slice.data(), slice.size() - 8);
} }
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
*buf = arena_->Allocate(len);
return static_cast<KeyHandle>(*buf);
}
// Encode a suitable internal key target for "target" and return it. // Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point // Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space. // into this scratch space.
@ -243,7 +248,9 @@ void MemTable::Add(SequenceNumber s, ValueType type,
const size_t encoded_len = const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size + VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size; VarintLength(val_size) + val_size;
char* buf = arena_.Allocate(encoded_len); char* buf = nullptr;
KeyHandle handle = table_->Allocate(encoded_len, &buf);
assert(buf != nullptr);
char* p = EncodeVarint32(buf, internal_key_size); char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size); memcpy(p, key.data(), key_size);
p += key_size; p += key_size;
@ -252,7 +259,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
p = EncodeVarint32(p, val_size); p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size); memcpy(p, value.data(), val_size);
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
table_->Insert(buf); table_->Insert(handle);
if (prefix_bloom_) { if (prefix_bloom_) {
assert(prefix_extractor_); assert(prefix_extractor_);
@ -370,8 +377,7 @@ static bool SaveValue(void* arg, const char* entry) {
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options) { MergeContext& merge_context, const Options& options) {
StopWatchNano memtable_get_timer(options.env, false); PERF_TIMER_AUTO(get_from_memtable_time);
StartPerfTimer(&memtable_get_timer);
Slice user_key = key.user_key(); Slice user_key = key.user_key();
bool found_final_value = false; bool found_final_value = false;
@ -401,8 +407,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (!found_final_value && merge_in_progress) { if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress(""); *s = Status::MergeInProgress("");
} }
BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); PERF_TIMER_STOP(get_from_memtable_time);
BumpPerfCount(&perf_context.get_from_memtable_count); PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value; return found_final_value;
} }

@ -13,7 +13,7 @@
#include <deque> #include <deque>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/skiplist.h" #include "db/skiplist.h"
#include "db/version_set.h" #include "db/version_edit.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "util/arena.h" #include "util/arena.h"
@ -39,7 +39,7 @@ class MemTable {
// MemTables are reference counted. The initial reference count // MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once. // is zero and the caller must call Ref() at least once.
explicit MemTable(const InternalKeyComparator& comparator, explicit MemTable(const InternalKeyComparator& comparator,
const Options& options = Options()); const Options& options);
~MemTable(); ~MemTable();
@ -147,14 +147,6 @@ class MemTable {
// be flushed to storage // be flushed to storage
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
// Returns the logfile number that can be safely deleted when this
// memstore is flushed to storage
uint64_t GetLogNumber() { return mem_logfile_number_; }
// Sets the logfile number that can be safely deleted when this
// memstore is flushed to storage
void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; }
// Notify the underlying storage that no more items will be added // Notify the underlying storage that no more items will be added
void MarkImmutable() { table_->MarkReadOnly(); } void MarkImmutable() { table_->MarkReadOnly(); }
@ -197,10 +189,6 @@ class MemTable {
// The log files earlier than this number can be deleted. // The log files earlier than this number can be deleted.
uint64_t mem_next_logfile_number_; uint64_t mem_next_logfile_number_;
// The log file that backs this memtable (to be deleted when
// memtable flush is done)
uint64_t mem_logfile_number_;
// rw locks for inplace updates // rw locks for inplace updates
std::vector<port::RWMutex> locks_; std::vector<port::RWMutex> locks_;

@ -8,9 +8,11 @@
#include <string> #include <string>
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "db/memtable.h" #include "db/memtable.h"
#include "db/version_set.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/log_buffer.h"
namespace rocksdb { namespace rocksdb {
@ -120,7 +122,8 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
} }
void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems, void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
uint64_t file_number, std::set<uint64_t>* pending_outputs) { uint64_t file_number,
std::set<uint64_t>* pending_outputs) {
assert(!mems.empty()); assert(!mems.empty());
// If the flush was not successful, then just reset state. // If the flush was not successful, then just reset state.
@ -140,10 +143,10 @@ void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
// Record a successful flush in the manifest file // Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults( Status MemTableList::InstallMemtableFlushResults(
const autovector<MemTable*>& mems, VersionSet* vset, ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
port::Mutex* mu, Logger* info_log, uint64_t file_number, port::Mutex* mu, Logger* info_log, uint64_t file_number,
std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete, std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
Directory* db_directory) { Directory* db_directory, LogBuffer* log_buffer) {
mu->AssertHeld(); mu->AssertHeld();
// flush was sucessful // flush was sucessful
@ -173,12 +176,11 @@ Status MemTableList::InstallMemtableFlushResults(
break; break;
} }
Log(info_log, LogToBuffer(log_buffer, "Level-0 commit table #%lu started",
"Level-0 commit table #%lu started", (unsigned long)m->file_number_);
(unsigned long)m->file_number_);
// this can release and reacquire the mutex. // this can release and reacquire the mutex.
s = vset->LogAndApply(&m->edit_, mu, db_directory); s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
// we will be changing the version in the next code path, // we will be changing the version in the next code path,
// so we better create a new one, since versions are immutable // so we better create a new one, since versions are immutable
@ -189,10 +191,8 @@ Status MemTableList::InstallMemtableFlushResults(
uint64_t mem_id = 1; // how many memtables has been flushed. uint64_t mem_id = 1; // how many memtables has been flushed.
do { do {
if (s.ok()) { // commit new state if (s.ok()) { // commit new state
Log(info_log, LogToBuffer(log_buffer, "Level-0 commit table #%lu: memtable #%lu done",
"Level-0 commit table #%lu: memtable #%lu done", (unsigned long)m->file_number_, (unsigned long)mem_id);
(unsigned long)m->file_number_,
(unsigned long)mem_id);
current_->Remove(m); current_->Remove(m);
assert(m->file_number_ > 0); assert(m->file_number_ > 0);

@ -7,19 +7,25 @@
#include <string> #include <string>
#include <list> #include <list>
#include <vector>
#include <set>
#include <deque> #include <deque>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/iterator.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/memtable.h"
#include "db/skiplist.h" #include "db/skiplist.h"
#include "rocksdb/db.h" #include "db/memtable.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "util/autovector.h" #include "util/autovector.h"
#include "util/log_buffer.h"
namespace rocksdb { namespace rocksdb {
class ColumnFamilyData;
class InternalKeyComparator; class InternalKeyComparator;
class Mutex; class Mutex;
@ -99,12 +105,14 @@ class MemTableList {
std::set<uint64_t>* pending_outputs); std::set<uint64_t>* pending_outputs);
// Commit a successful flush in the manifest file // Commit a successful flush in the manifest file
Status InstallMemtableFlushResults(const autovector<MemTable*>& m, Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
const autovector<MemTable*>& m,
VersionSet* vset, port::Mutex* mu, VersionSet* vset, port::Mutex* mu,
Logger* info_log, uint64_t file_number, Logger* info_log, uint64_t file_number,
std::set<uint64_t>& pending_outputs, std::set<uint64_t>& pending_outputs,
autovector<MemTable*>* to_delete, autovector<MemTable*>* to_delete,
Directory* db_directory); Directory* db_directory,
LogBuffer* log_buffer);
// New memtables are inserted at the front of the list. // New memtables are inserted at the front of the list.
// Takes ownership of the referenced held on *m by the caller of Add(). // Takes ownership of the referenced held on *m by the caller of Add().

@ -429,6 +429,48 @@ TEST(PlainTableDBTest, Iterator) {
} }
} }
// Builds a key consisting of `length` repetitions of character `c`.
std::string MakeLongKey(size_t length, char c) {
  std::string key;
  key.assign(length, c);
  return key;
}
// Verifies that iteration returns keys larger than the 16-byte huge-key
// threshold configured on the total-order plain table, in order and with
// their matching values.
TEST(PlainTableDBTest, IteratorLargeKeys) {
  Options opts = CurrentOptions();
  opts.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
  opts.create_if_missing = true;
  opts.prefix_extractor.reset();
  DestroyAndReopen(&opts);

  // Seven keys of assorted lengths, several beyond the 16-byte threshold.
  std::string keys[] = {MakeLongKey(30, '0'), MakeLongKey(16, '1'),
                        MakeLongKey(32, '2'), MakeLongKey(60, '3'),
                        MakeLongKey(90, '4'), MakeLongKey(50, '5'),
                        MakeLongKey(26, '6')};

  for (size_t idx = 0; idx < 7; idx++) {
    ASSERT_OK(Put(keys[idx], std::to_string(idx)));
  }

  dbfull()->TEST_FlushMemTable();

  Iterator* it = dbfull()->NewIterator(ro_);
  it->Seek(keys[0]);
  for (size_t idx = 0; idx < 7; idx++) {
    ASSERT_TRUE(it->Valid());
    ASSERT_EQ(keys[idx], it->key().ToString());
    ASSERT_EQ(std::to_string(idx), it->value().ToString());
    it->Next();
  }
  ASSERT_TRUE(!it->Valid());
  delete it;
}
// A test comparator which compare two strings in this way: // A test comparator which compare two strings in this way:
// (1) first compare prefix of 8 bytes in alphabet order, // (1) first compare prefix of 8 bytes in alphabet order,
// (2) if two strings share the same prefix, sort the other part of the string // (2) if two strings share the same prefix, sort the other part of the string

@ -55,14 +55,20 @@ class Repairer {
icmp_(options.comparator), icmp_(options.comparator),
ipolicy_(options.filter_policy), ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
raw_table_cache_(
// TableCache can be small since we expect each table to be opened
// once.
NewLRUCache(10, options_.table_cache_numshardbits,
options_.table_cache_remove_scan_count_limit)),
next_file_number_(1) { next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once. table_cache_ = new TableCache(dbname_, &options_, storage_options_,
table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); raw_table_cache_.get());
edit_ = new VersionEdit(); edit_ = new VersionEdit();
} }
~Repairer() { ~Repairer() {
delete table_cache_; delete table_cache_;
raw_table_cache_.reset();
delete edit_; delete edit_;
} }
@ -102,6 +108,7 @@ class Repairer {
InternalKeyComparator const icmp_; InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_; InternalFilterPolicy const ipolicy_;
Options const options_; Options const options_;
std::shared_ptr<Cache> raw_table_cache_;
TableCache* table_cache_; TableCache* table_cache_;
VersionEdit* edit_; VersionEdit* edit_;
@ -197,6 +204,7 @@ class Repairer {
Slice record; Slice record;
WriteBatch batch; WriteBatch batch;
MemTable* mem = new MemTable(icmp_, options_); MemTable* mem = new MemTable(icmp_, options_);
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
mem->Ref(); mem->Ref();
int counter = 0; int counter = 0;
while (reader.ReadRecord(&record, &scratch)) { while (reader.ReadRecord(&record, &scratch)) {
@ -206,7 +214,7 @@ class Repairer {
continue; continue;
} }
WriteBatchInternal::SetContents(&batch, record); WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, mem, &options_); status = WriteBatchInternal::InsertInto(&batch, cf_mems_default);
if (status.ok()) { if (status.ok()) {
counter += WriteBatchInternal::Count(&batch); counter += WriteBatchInternal::Count(&batch);
} else { } else {
@ -226,6 +234,7 @@ class Repairer {
iter, &meta, icmp_, 0, 0, kNoCompression); iter, &meta, icmp_, 0, 0, kNoCompression);
delete iter; delete iter;
delete mem->Unref(); delete mem->Unref();
delete cf_mems_default;
mem = nullptr; mem = nullptr;
if (status.ok()) { if (status.ok()) {
if (meta.file_size > 0) { if (meta.file_size > 0) {

@ -35,18 +35,13 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) {
sizeof(*file_number)); sizeof(*file_number));
} }
TableCache::TableCache(const std::string& dbname, TableCache::TableCache(const std::string& dbname, const Options* options,
const Options* options, const EnvOptions& storage_options, Cache* const cache)
const EnvOptions& storage_options,
int entries)
: env_(options->env), : env_(options->env),
dbname_(dbname), dbname_(dbname),
options_(options), options_(options),
storage_options_(storage_options), storage_options_(storage_options),
cache_( cache_(cache) {}
NewLRUCache(entries, options->table_cache_numshardbits,
options->table_cache_remove_scan_count_limit)) {
}
TableCache::~TableCache() { TableCache::~TableCache() {
} }
@ -124,7 +119,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
TableReader* table_reader = GetTableReaderFromHandle(handle); TableReader* table_reader = GetTableReaderFromHandle(handle);
Iterator* result = table_reader->NewIterator(options); Iterator* result = table_reader->NewIterator(options);
if (!file_meta.table_reader_handle) { if (!file_meta.table_reader_handle) {
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); result->RegisterCleanup(&UnrefEntry, cache_, handle);
} }
if (table_reader_ptr != nullptr) { if (table_reader_ptr != nullptr) {
*table_reader_ptr = table_reader; *table_reader_ptr = table_reader;
@ -216,8 +211,8 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options,
return may_match; return may_match;
} }
void TableCache::Evict(uint64_t file_number) { void TableCache::Evict(Cache* cache, uint64_t file_number) {
cache_->Erase(GetSliceForFileNumber(&file_number)); cache->Erase(GetSliceForFileNumber(&file_number));
} }
} // namespace rocksdb } // namespace rocksdb

@ -30,7 +30,7 @@ struct FileMetaData;
class TableCache { class TableCache {
public: public:
TableCache(const std::string& dbname, const Options* options, TableCache(const std::string& dbname, const Options* options,
const EnvOptions& storage_options, int entries); const EnvOptions& storage_options, Cache* cache);
~TableCache(); ~TableCache();
// Return an iterator for the specified file number (the corresponding // Return an iterator for the specified file number (the corresponding
@ -64,7 +64,7 @@ class TableCache {
const Slice& internal_prefix, bool* table_io); const Slice& internal_prefix, bool* table_io);
// Evict any entry for the specified file number // Evict any entry for the specified file number
void Evict(uint64_t file_number); static void Evict(Cache* cache, uint64_t file_number);
// Find table reader // Find table reader
Status FindTable(const EnvOptions& toptions, Status FindTable(const EnvOptions& toptions,
@ -95,7 +95,7 @@ class TableCache {
const std::string dbname_; const std::string dbname_;
const Options* options_; const Options* options_;
const EnvOptions& storage_options_; const EnvOptions& storage_options_;
std::shared_ptr<Cache> cache_; Cache* const cache_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -8,15 +8,19 @@
#include <string> #include <string>
#include <utility> #include <utility>
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/column_family.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
namespace rocksdb { namespace rocksdb {
TailingIterator::TailingIterator(DBImpl* db, const ReadOptions& options, TailingIterator::TailingIterator(DBImpl* db, const ReadOptions& options,
const Comparator* comparator) ColumnFamilyData* cfd)
: db_(db), options_(options), comparator_(comparator), : db_(db),
version_number_(0), current_(nullptr), options_(options),
cfd_(cfd),
version_number_(0),
current_(nullptr),
status_(Status::InvalidArgument("Seek() not called on this iterator")) {} status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
bool TailingIterator::Valid() const { bool TailingIterator::Valid() const {
@ -53,10 +57,9 @@ void TailingIterator::Seek(const Slice& target) {
// 'target' -- in this case, prev_key_ is included in the interval, so // 'target' -- in this case, prev_key_ is included in the interval, so
// prev_inclusive_ has to be set. // prev_inclusive_ has to be set.
if (!is_prev_set_ || const Comparator* cmp = cfd_->user_comparator();
comparator_->Compare(prev_key_, target) >= !is_prev_inclusive_ || if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
(immutable_->Valid() && (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
comparator_->Compare(target, immutable_->key()) > 0) ||
(options_.prefix_seek && !IsSamePrefix(target))) { (options_.prefix_seek && !IsSamePrefix(target))) {
SeekImmutable(target); SeekImmutable(target);
} }
@ -121,7 +124,7 @@ void TailingIterator::SeekToLast() {
void TailingIterator::CreateIterators() { void TailingIterator::CreateIterators() {
std::pair<Iterator*, Iterator*> iters = std::pair<Iterator*, Iterator*> iters =
db_->GetTailingIteratorPair(options_, &version_number_); db_->GetTailingIteratorPair(options_, cfd_, &version_number_);
assert(iters.first && iters.second); assert(iters.first && iters.second);
@ -137,9 +140,10 @@ void TailingIterator::UpdateCurrent() {
if (mutable_->Valid()) { if (mutable_->Valid()) {
current_ = mutable_.get(); current_ = mutable_.get();
} }
const Comparator* cmp = cfd_->user_comparator();
if (immutable_->Valid() && if (immutable_->Valid() &&
(current_ == nullptr || (current_ == nullptr ||
comparator_->Compare(immutable_->key(), current_->key()) < 0)) { cmp->Compare(immutable_->key(), current_->key()) < 0)) {
current_ = immutable_.get(); current_ = immutable_.get();
} }
@ -151,11 +155,11 @@ void TailingIterator::UpdateCurrent() {
bool TailingIterator::IsCurrentVersion() const { bool TailingIterator::IsCurrentVersion() const {
return mutable_ != nullptr && immutable_ != nullptr && return mutable_ != nullptr && immutable_ != nullptr &&
version_number_ == db_->CurrentVersionNumber(); version_number_ == cfd_->GetSuperVersionNumber();
} }
bool TailingIterator::IsSamePrefix(const Slice& target) const { bool TailingIterator::IsSamePrefix(const Slice& target) const {
const SliceTransform* extractor = db_->options_.prefix_extractor.get(); const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
assert(extractor); assert(extractor);
assert(is_prev_set_); assert(is_prev_set_);

@ -13,6 +13,7 @@
namespace rocksdb { namespace rocksdb {
class DBImpl; class DBImpl;
class ColumnFamilyData;
/** /**
* TailingIterator is a special type of iterator that doesn't use an (implicit) * TailingIterator is a special type of iterator that doesn't use an (implicit)
@ -25,7 +26,7 @@ class DBImpl;
class TailingIterator : public Iterator { class TailingIterator : public Iterator {
public: public:
TailingIterator(DBImpl* db, const ReadOptions& options, TailingIterator(DBImpl* db, const ReadOptions& options,
const Comparator* comparator); ColumnFamilyData* cfd);
virtual ~TailingIterator() {} virtual ~TailingIterator() {}
virtual bool Valid() const override; virtual bool Valid() const override;
@ -41,7 +42,7 @@ class TailingIterator : public Iterator {
private: private:
DBImpl* const db_; DBImpl* const db_;
const ReadOptions options_; const ReadOptions options_;
const Comparator* const comparator_; ColumnFamilyData* const cfd_;
uint64_t version_number_; uint64_t version_number_;
// TailingIterator merges the contents of the two iterators below (one using // TailingIterator merges the contents of the two iterators below (one using

@ -9,7 +9,7 @@
namespace rocksdb { namespace rocksdb {
TransactionLogIteratorImpl::TransactionLogIteratorImpl( TransactionLogIteratorImpl::TransactionLogIteratorImpl(
const std::string& dir, const Options* options, const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options, const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seq, const EnvOptions& soptions, const SequenceNumber seq,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl) std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)

@ -67,7 +67,7 @@ class LogFileImpl : public LogFile {
class TransactionLogIteratorImpl : public TransactionLogIterator { class TransactionLogIteratorImpl : public TransactionLogIterator {
public: public:
TransactionLogIteratorImpl( TransactionLogIteratorImpl(
const std::string& dir, const Options* options, const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options, const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seqNum, const EnvOptions& soptions, const SequenceNumber seqNum,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl); std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
@ -82,7 +82,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
private: private:
const std::string& dir_; const std::string& dir_;
const Options* options_; const DBOptions* options_;
const TransactionLogIterator::ReadOptions read_options_; const TransactionLogIterator::ReadOptions read_options_;
const EnvOptions& soptions_; const EnvOptions& soptions_;
SequenceNumber startingSequenceNumber_; SequenceNumber startingSequenceNumber_;

@ -11,6 +11,7 @@
#include "db/version_set.h" #include "db/version_set.h"
#include "util/coding.h" #include "util/coding.h"
#include "rocksdb/slice.h"
namespace rocksdb { namespace rocksdb {
@ -29,6 +30,11 @@ enum Tag {
// these are new formats divergent from open source leveldb // these are new formats divergent from open source leveldb
kNewFile2 = 100, // store smallest & largest seqno kNewFile2 = 100, // store smallest & largest seqno
kColumnFamily = 200, // specify column family for version edit
kColumnFamilyAdd = 201,
kColumnFamilyDrop = 202,
kMaxColumnFamily = 203,
}; };
void VersionEdit::Clear() { void VersionEdit::Clear() {
@ -38,13 +44,19 @@ void VersionEdit::Clear() {
prev_log_number_ = 0; prev_log_number_ = 0;
last_sequence_ = 0; last_sequence_ = 0;
next_file_number_ = 0; next_file_number_ = 0;
max_column_family_ = 0;
has_comparator_ = false; has_comparator_ = false;
has_log_number_ = false; has_log_number_ = false;
has_prev_log_number_ = false; has_prev_log_number_ = false;
has_next_file_number_ = false; has_next_file_number_ = false;
has_last_sequence_ = false; has_last_sequence_ = false;
has_max_column_family_ = false;
deleted_files_.clear(); deleted_files_.clear();
new_files_.clear(); new_files_.clear();
column_family_ = 0;
is_column_family_add_ = 0;
is_column_family_drop_ = 0;
column_family_name_.clear();
} }
void VersionEdit::EncodeTo(std::string* dst) const { void VersionEdit::EncodeTo(std::string* dst) const {
@ -68,6 +80,10 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint32(dst, kLastSequence); PutVarint32(dst, kLastSequence);
PutVarint64(dst, last_sequence_); PutVarint64(dst, last_sequence_);
} }
if (has_max_column_family_) {
PutVarint32(dst, kMaxColumnFamily);
PutVarint32(dst, max_column_family_);
}
for (const auto& deleted : deleted_files_) { for (const auto& deleted : deleted_files_) {
PutVarint32(dst, kDeletedFile); PutVarint32(dst, kDeletedFile);
@ -86,6 +102,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint64(dst, f.smallest_seqno); PutVarint64(dst, f.smallest_seqno);
PutVarint64(dst, f.largest_seqno); PutVarint64(dst, f.largest_seqno);
} }
// 0 is default and does not need to be explicitly written
if (column_family_ != 0) {
PutVarint32(dst, kColumnFamily);
PutVarint32(dst, column_family_);
}
if (is_column_family_add_) {
PutVarint32(dst, kColumnFamilyAdd);
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
}
if (is_column_family_drop_) {
PutVarint32(dst, kColumnFamilyDrop);
}
} }
static bool GetInternalKey(Slice* input, InternalKey* dst) { static bool GetInternalKey(Slice* input, InternalKey* dst) {
@ -167,6 +198,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
} }
break; break;
case kMaxColumnFamily:
if (GetVarint32(&input, &max_column_family_)) {
has_max_column_family_ = true;
} else {
msg = "max column family";
}
break;
case kCompactPointer: case kCompactPointer:
if (GetLevel(&input, &level, &msg) && if (GetLevel(&input, &level, &msg) &&
GetInternalKey(&input, &key)) { GetInternalKey(&input, &key)) {
@ -221,6 +260,29 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
} }
break; break;
case kColumnFamily:
if (!GetVarint32(&input, &column_family_)) {
if (!msg) {
msg = "set column family id";
}
}
break;
case kColumnFamilyAdd:
if (GetLengthPrefixedSlice(&input, &str)) {
is_column_family_add_ = true;
column_family_name_ = str.ToString();
} else {
if (!msg) {
msg = "column family add";
}
}
break;
case kColumnFamilyDrop:
is_column_family_drop_ = true;
break;
default: default:
msg = "unknown tag"; msg = "unknown tag";
break; break;
@ -282,6 +344,19 @@ std::string VersionEdit::DebugString(bool hex_key) const {
r.append(" .. "); r.append(" .. ");
r.append(f.largest.DebugString(hex_key)); r.append(f.largest.DebugString(hex_key));
} }
r.append("\n ColumnFamily: ");
AppendNumberTo(&r, column_family_);
if (is_column_family_add_) {
r.append("\n ColumnFamilyAdd: ");
r.append(column_family_name_);
}
if (is_column_family_drop_) {
r.append("\n ColumnFamilyDrop");
}
if (has_max_column_family_) {
r.append("\n MaxColumnFamily: ");
AppendNumberTo(&r, max_column_family_);
}
r.append("\n}\n"); r.append("\n}\n");
return r; return r;
} }

@ -11,6 +11,7 @@
#include <set> #include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <string>
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "db/dbformat.h" #include "db/dbformat.h"
@ -32,11 +33,14 @@ struct FileMetaData {
// Needs to be disposed when refs becomes 0. // Needs to be disposed when refs becomes 0.
Cache::Handle* table_reader_handle; Cache::Handle* table_reader_handle;
FileMetaData(uint64_t number, uint64_t file_size) : FileMetaData(uint64_t number, uint64_t file_size)
refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size), : refs(0),
being_compacted(false), table_reader_handle(nullptr) { allowed_seeks(1 << 30),
} number(number),
FileMetaData() : FileMetaData(0, 0) { } file_size(file_size),
being_compacted(false),
table_reader_handle(nullptr) {}
FileMetaData() : FileMetaData(0, 0) {}
}; };
class VersionEdit { class VersionEdit {
@ -66,6 +70,10 @@ class VersionEdit {
has_last_sequence_ = true; has_last_sequence_ = true;
last_sequence_ = seq; last_sequence_ = seq;
} }
void SetMaxColumnFamily(uint32_t max_column_family) {
has_max_column_family_ = true;
max_column_family_ = max_column_family;
}
// Add the specified file at the specified number. // Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
@ -97,6 +105,31 @@ class VersionEdit {
return new_files_.size() + deleted_files_.size(); return new_files_.size() + deleted_files_.size();
} }
bool IsColumnFamilyManipulation() {
return is_column_family_add_ || is_column_family_drop_;
}
void SetColumnFamily(uint32_t column_family_id) {
column_family_ = column_family_id;
}
// set column family ID by calling SetColumnFamily()
void AddColumnFamily(const std::string& name) {
assert(!is_column_family_drop_);
assert(!is_column_family_add_);
assert(NumEntries() == 0);
is_column_family_add_ = true;
column_family_name_ = name;
}
// set column family ID by calling SetColumnFamily()
void DropColumnFamily() {
assert(!is_column_family_drop_);
assert(!is_column_family_add_);
assert(NumEntries() == 0);
is_column_family_drop_ = true;
}
void EncodeTo(std::string* dst) const; void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src); Status DecodeFrom(const Slice& src);
@ -114,15 +147,27 @@ class VersionEdit {
uint64_t log_number_; uint64_t log_number_;
uint64_t prev_log_number_; uint64_t prev_log_number_;
uint64_t next_file_number_; uint64_t next_file_number_;
uint32_t max_column_family_;
SequenceNumber last_sequence_; SequenceNumber last_sequence_;
bool has_comparator_; bool has_comparator_;
bool has_log_number_; bool has_log_number_;
bool has_prev_log_number_; bool has_prev_log_number_;
bool has_next_file_number_; bool has_next_file_number_;
bool has_last_sequence_; bool has_last_sequence_;
bool has_max_column_family_;
DeletedFileSet deleted_files_; DeletedFileSet deleted_files_;
std::vector<std::pair<int, FileMetaData> > new_files_; std::vector<std::pair<int, FileMetaData>> new_files_;
// Each version edit record should have column_family_id set
// If it's not set, it is default (0)
uint32_t column_family_;
// a version edit can be either column_family add or
// column_family drop. If it's column family add,
// it also includes column family name.
bool is_column_family_drop_;
bool is_column_family_add_;
std::string column_family_name_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -45,6 +45,19 @@ TEST(VersionEditTest, EncodeDecode) {
TestEncodeDecode(edit); TestEncodeDecode(edit);
} }
TEST(VersionEditTest, ColumnFamilyTest) {
VersionEdit edit;
edit.SetColumnFamily(2);
edit.AddColumnFamily("column_family");
edit.SetMaxColumnFamily(5);
TestEncodeDecode(edit);
edit.Clear();
edit.SetColumnFamily(3);
edit.DropColumnFamily();
TestEncodeDecode(edit);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

File diff suppressed because it is too large Load Diff

@ -24,12 +24,15 @@
#include <vector> #include <vector>
#include <deque> #include <deque>
#include <atomic> #include <atomic>
#include <limits>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "port/port.h" #include "port/port.h"
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/compaction.h" #include "db/compaction.h"
#include "db/compaction_picker.h" #include "db/compaction_picker.h"
#include "db/column_family.h"
#include "db/log_reader.h"
namespace rocksdb { namespace rocksdb {
@ -41,10 +44,12 @@ class Iterator;
class LogBuffer; class LogBuffer;
class LookupKey; class LookupKey;
class MemTable; class MemTable;
class MergeContext;
class TableCache;
class Version; class Version;
class VersionSet; class VersionSet;
class MergeContext;
class ColumnFamilyData;
class ColumnFamilySet;
class TableCache;
// Return the smallest index i such that files[i]->largest >= key. // Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file. // Return files.size() if there is no such file.
@ -208,6 +213,7 @@ class Version {
friend class Compaction; friend class Compaction;
friend class VersionSet; friend class VersionSet;
friend class DBImpl; friend class DBImpl;
friend class ColumnFamilyData;
friend class CompactionPicker; friend class CompactionPicker;
friend class LevelCompactionPicker; friend class LevelCompactionPicker;
friend class UniversalCompactionPicker; friend class UniversalCompactionPicker;
@ -223,6 +229,7 @@ class Version {
// record results in files_by_size_. The largest files are listed first. // record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize(); void UpdateFilesBySize();
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
VersionSet* vset_; // VersionSet to which this Version belongs VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list Version* prev_; // Previous version in linked list
@ -268,7 +275,7 @@ class Version {
// used for debugging and logging purposes only. // used for debugging and logging purposes only.
uint64_t version_number_; uint64_t version_number_;
explicit Version(VersionSet* vset, uint64_t version_number = 0); Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
~Version(); ~Version();
@ -285,22 +292,29 @@ class Version {
class VersionSet { class VersionSet {
public: public:
VersionSet(const std::string& dbname, const Options* options, VersionSet(const std::string& dbname, const DBOptions* options,
const EnvOptions& storage_options, TableCache* table_cache, const EnvOptions& storage_options, Cache* table_cache);
const InternalKeyComparator*);
~VersionSet(); ~VersionSet();
// Apply *edit to the current version to form a new descriptor that // Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new // is both saved to persistent state and installed as the new
// current version. Will release *mu while actually writing to the file. // current version. Will release *mu while actually writing to the file.
// column_family_options has to be set if edit is column family add
// REQUIRES: *mu is held on entry. // REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply() // REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(VersionEdit* edit, port::Mutex* mu, Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
Directory* db_directory = nullptr, port::Mutex* mu, Directory* db_directory = nullptr,
bool new_descriptor_log = false); bool new_descriptor_log = false,
const ColumnFamilyOptions* column_family_options =
nullptr);
// Recover the last saved descriptor from persistent storage. // Recover the last saved descriptor from persistent storage.
Status Recover(); Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families);
// Reads a manifest file and returns a list of column families in
// column_families.
static Status ListColumnFamilies(std::vector<std::string>* column_families,
const std::string& dbname, Env* env);
// Try to reduce the number of levels. This call is valid when // Try to reduce the number of levels. This call is valid when
// only one level from the new max level to the old // only one level from the new max level to the old
@ -316,15 +330,6 @@ class VersionSet {
const EnvOptions& storage_options, const EnvOptions& storage_options,
int new_levels); int new_levels);
// Return the current version.
Version* current() const { return current_; }
// A Flag indicating whether write needs to slowdown because of there are
// too many number of level0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
// Return the current manifest file number // Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; } uint64_t ManifestFileNumber() const { return manifest_file_number_; }
@ -358,37 +363,21 @@ class VersionSet {
// Mark the specified file number as used. // Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number); void MarkFileNumberUsed(uint64_t number);
// Return the current log file number.
uint64_t LogNumber() const { return log_number_; }
// Return the log file number for the log file that is currently // Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file. // being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; } uint64_t PrevLogNumber() const { return prev_log_number_; }
int NumberLevels() const { return num_levels_; } // Returns the minimum log number such that all
// log numbers less than or equal to it can be deleted
// Pick level and inputs for a new compaction. uint64_t MinLogNumber() const {
// Returns nullptr if there is no compaction to be done. uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
// Otherwise returns a pointer to a heap-allocated object that for (auto cfd : *column_family_set_) {
// describes the compaction. Caller should delete the result. if (min_log_num > cfd->GetLogNumber()) {
Compaction* PickCompaction(LogBuffer* log_buffer); min_log_num = cfd->GetLogNumber();
}
// Return a compaction object for compacting the range [begin,end] in }
// the specified level. Returns nullptr if there is nothing in that return min_log_num;
// level that overlaps the specified range. Caller should delete }
// the result.
//
// The returned Compaction might not include the whole requested range.
// In that case, compaction_end will be set to the next key that needs
// compacting. In case the compaction will compact the whole range,
// compaction_end will be set to nullptr.
// Client is responsible for compaction_end storage -- when called,
// *compaction_end should point to valid InternalKey!
Compaction* CompactRange(int input_level,
int output_level,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end);
// Create an iterator that reads over the compaction inputs for "*c". // Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed. // The caller should delete the iterator when no longer needed.
@ -414,62 +403,53 @@ class VersionSet {
// pick the same files to compact. // pick the same files to compact.
bool VerifyCompactionFileConsistency(Compaction* c); bool VerifyCompactionFileConsistency(Compaction* c);
double MaxBytesForLevel(int level); Status GetMetadataForFile(uint64_t number, int* filelevel,
FileMetaData** metadata, ColumnFamilyData** cfd);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level);
void ReleaseCompactionFiles(Compaction* c, Status status);
Status GetMetadataForFile(
uint64_t number, int *filelevel, FileMetaData **metadata);
void GetLiveFilesMetaData( void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata); std::vector<LiveFileMetaData> *metadata);
void GetObsoleteFiles(std::vector<FileMetaData*>* files); void GetObsoleteFiles(std::vector<FileMetaData*>* files);
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
private: private:
class Builder; class Builder;
struct ManifestWriter; struct ManifestWriter;
friend class Compaction;
friend class Version; friend class Version;
struct LogReporter : public log::Reader::Reporter {
Status* status;
virtual void Corruption(size_t bytes, const Status& s) {
if (this->status->ok()) *this->status = s;
}
};
// Save current contents to *log // Save current contents to *log
Status WriteSnapshot(log::Writer* log); Status WriteSnapshot(log::Writer* log);
void AppendVersion(Version* v); void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
bool ManifestContains(uint64_t manifest_file_number, bool ManifestContains(uint64_t manifest_file_number,
const std::string& record) const; const std::string& record) const;
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
VersionEdit* edit);
std::unique_ptr<ColumnFamilySet> column_family_set_;
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
const Options* const options_; const DBOptions* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_; uint64_t next_file_number_;
uint64_t manifest_file_number_; uint64_t manifest_file_number_;
uint64_t pending_manifest_file_number_; uint64_t pending_manifest_file_number_;
std::atomic<uint64_t> last_sequence_; std::atomic<uint64_t> last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
int num_levels_;
// Opened lazily // Opened lazily
unique_ptr<log::Writer> descriptor_log_; unique_ptr<log::Writer> descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
// An object that keeps all the compaction stats
// and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;
// generates a increasing version number for every new version // generates a increasing version number for every new version
uint64_t current_version_number_; uint64_t current_version_number_;
@ -493,8 +473,9 @@ class VersionSet {
VersionSet(const VersionSet&); VersionSet(const VersionSet&);
void operator=(const VersionSet&); void operator=(const VersionSet&);
void LogAndApplyHelper(Builder*b, Version* v, void LogAndApplyCFHelper(VersionEdit* edit);
VersionEdit* edit, port::Mutex* mu); void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
VersionEdit* edit, port::Mutex* mu);
}; };
} // namespace rocksdb } // namespace rocksdb

@ -15,6 +15,9 @@
// kTypeValue varstring varstring // kTypeValue varstring varstring
// kTypeMerge varstring varstring // kTypeMerge varstring varstring
// kTypeDeletion varstring // kTypeDeletion varstring
// kTypeColumnFamilyValue varint32 varstring varstring
// kTypeColumnFamilyMerge varint32 varstring varstring
// kTypeColumnFamilyDeletion varint32 varstring varstring
// varstring := // varstring :=
// len: varint32 // len: varint32
// data: uint8[len] // data: uint8[len]
@ -45,10 +48,20 @@ WriteBatch::~WriteBatch() { }
WriteBatch::Handler::~Handler() { } WriteBatch::Handler::~Handler() { }
void WriteBatch::Handler::Put(const Slice& key, const Slice& value) {
// you need to either implement Put or PutCF
throw std::runtime_error("Handler::Put not implemented!");
}
void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
throw std::runtime_error("Handler::Merge not implemented!"); throw std::runtime_error("Handler::Merge not implemented!");
} }
void WriteBatch::Handler::Delete(const Slice& key) {
// you need to either implement Delete or DeleteCF
throw std::runtime_error("Handler::Delete not implemented!");
}
void WriteBatch::Handler::LogData(const Slice& blob) { void WriteBatch::Handler::LogData(const Slice& blob) {
// If the user has not specified something to do with blobs, then we ignore // If the user has not specified something to do with blobs, then we ignore
// them. // them.
@ -76,31 +89,48 @@ Status WriteBatch::Iterate(Handler* handler) const {
input.remove_prefix(kHeader); input.remove_prefix(kHeader);
Slice key, value, blob; Slice key, value, blob;
int found = 0; int found = 0;
while (!input.empty() && handler->Continue()) { Status s;
while (s.ok() && !input.empty() && handler->Continue()) {
char tag = input[0]; char tag = input[0];
input.remove_prefix(1); input.remove_prefix(1);
uint32_t column_family = 0; // default
switch (tag) { switch (tag) {
case kTypeColumnFamilyValue:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Put");
}
// intentional fallthrough
case kTypeValue: case kTypeValue:
if (GetLengthPrefixedSlice(&input, &key) && if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) { GetLengthPrefixedSlice(&input, &value)) {
handler->Put(key, value); s = handler->PutCF(column_family, key, value);
found++; found++;
} else { } else {
return Status::Corruption("bad WriteBatch Put"); return Status::Corruption("bad WriteBatch Put");
} }
break; break;
case kTypeColumnFamilyDeletion:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Delete");
}
// intentional fallthrough
case kTypeDeletion: case kTypeDeletion:
if (GetLengthPrefixedSlice(&input, &key)) { if (GetLengthPrefixedSlice(&input, &key)) {
handler->Delete(key); s = handler->DeleteCF(column_family, key);
found++; found++;
} else { } else {
return Status::Corruption("bad WriteBatch Delete"); return Status::Corruption("bad WriteBatch Delete");
} }
break; break;
case kTypeColumnFamilyMerge:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Merge");
}
// intentional fallthrough
case kTypeMerge: case kTypeMerge:
if (GetLengthPrefixedSlice(&input, &key) && if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) { GetLengthPrefixedSlice(&input, &value)) {
handler->Merge(key, value); s = handler->MergeCF(column_family, key, value);
found++; found++;
} else { } else {
return Status::Corruption("bad WriteBatch Merge"); return Status::Corruption("bad WriteBatch Merge");
@ -117,7 +147,10 @@ Status WriteBatch::Iterate(Handler* handler) const {
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");
} }
} }
if (found != WriteBatchInternal::Count(this)) { if (!s.ok()) {
return s;
}
if (found != WriteBatchInternal::Count(this)) {
return Status::Corruption("WriteBatch has wrong count"); return Status::Corruption("WriteBatch has wrong count");
} else { } else {
return Status::OK(); return Status::OK();
@ -140,29 +173,76 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq); EncodeFixed64(&b->rep_[0], seq);
} }
void WriteBatch::Put(const Slice& key, const Slice& value) { void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeValue));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value); PutLengthPrefixedSlice(&rep_, value);
} }
void WriteBatch::Put(const SliceParts& key, const SliceParts& value) { void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeValue));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSliceParts(&rep_, key); PutLengthPrefixedSliceParts(&rep_, key);
PutLengthPrefixedSliceParts(&rep_, value); PutLengthPrefixedSliceParts(&rep_, value);
} }
void WriteBatch::Delete(const Slice& key) { void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeDeletion)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeDeletion));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
} }
void WriteBatch::Merge(const Slice& key, const Slice& value) { void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeMerge)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeMerge));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value); PutLengthPrefixedSlice(&rep_, value);
} }
@ -176,33 +256,70 @@ namespace {
class MemTableInserter : public WriteBatch::Handler { class MemTableInserter : public WriteBatch::Handler {
public: public:
SequenceNumber sequence_; SequenceNumber sequence_;
MemTable* mem_; ColumnFamilyMemTables* cf_mems_;
const Options* options_; bool recovery_;
uint64_t log_number_;
DBImpl* db_; DBImpl* db_;
const bool filter_deletes_; const bool dont_filter_deletes_;
MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts, MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
DB* db, const bool filter_deletes) bool recovery, uint64_t log_number, DB* db,
: sequence_(sequence), const bool dont_filter_deletes)
mem_(mem), : sequence_(sequence),
options_(opts), cf_mems_(cf_mems),
db_(reinterpret_cast<DBImpl*>(db)), recovery_(recovery),
filter_deletes_(filter_deletes) { log_number_(log_number),
assert(mem_); db_(reinterpret_cast<DBImpl*>(db)),
if (filter_deletes_) { dont_filter_deletes_(dont_filter_deletes) {
assert(options_); assert(cf_mems);
if (!dont_filter_deletes_) {
assert(db_); assert(db_);
} }
} }
virtual void Put(const Slice& key, const Slice& value) { bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
if (!options_->inplace_update_support) { bool found = cf_mems_->Seek(column_family_id);
mem_->Add(sequence_, kTypeValue, key, value); if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) {
} else if (options_->inplace_callback == nullptr) { // if in recovery envoronment:
mem_->Update(sequence_, key, value); // * If column family was not found, it might mean that the WAL write
RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); // batch references to the column family that was dropped after the
// insert. We don't want to fail the whole write batch in that case -- we
// just ignore the update.
// * If log_number_ < cf_mems_->GetLogNumber(), this means that column
// family already contains updates from this log. We can't apply updates
// twice because of update-in-place or merge workloads -- ignore the
// update
*s = Status::OK();
return false;
}
if (!found) {
assert(!recovery_);
// If the column family was not found in non-recovery enviornment
// (client's write code-path), we have to fail the write and return
// the failure status to the client.
*s = Status::InvalidArgument(
"Invalid column family specified in write batch");
return false;
}
return true;
}
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions();
if (!options->inplace_update_support) {
mem->Add(sequence_, kTypeValue, key, value);
} else if (options->inplace_callback == nullptr) {
mem->Update(sequence_, key, value);
RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
} else { } else {
if (mem_->UpdateCallback(sequence_, key, value, *options_)) { if (mem->UpdateCallback(sequence_, key, value, *options)) {
} else { } else {
// key not found in memtable. Do sst get, update, add // key not found in memtable. Do sst get, update, add
SnapshotImpl read_from_snapshot; SnapshotImpl read_from_snapshot;
@ -212,21 +329,26 @@ class MemTableInserter : public WriteBatch::Handler {
std::string prev_value; std::string prev_value;
std::string merged_value; std::string merged_value;
Status s = db_->Get(ropts, key, &prev_value);
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
Status s = db_->Get(ropts, cf_handle, key, &prev_value);
char* prev_buffer = const_cast<char*>(prev_value.c_str()); char* prev_buffer = const_cast<char*>(prev_value.c_str());
uint32_t prev_size = prev_value.size(); uint32_t prev_size = prev_value.size();
auto status = auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr,
options_->inplace_callback(s.ok() ? prev_buffer: nullptr, s.ok() ? &prev_size : nullptr,
s.ok() ? &prev_size: nullptr, value, &merged_value);
value, &merged_value);
if (status == UpdateStatus::UPDATED_INPLACE) { if (status == UpdateStatus::UPDATED_INPLACE) {
// prev_value is updated in-place with final value. // prev_value is updated in-place with final value.
mem_->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
} else if (status == UpdateStatus::UPDATED) { } else if (status == UpdateStatus::UPDATED) {
// merged_value contains the final value. // merged_value contains the final value.
mem_->Add(sequence_, kTypeValue, key, Slice(merged_value)); mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
} }
} }
} }
@ -234,19 +356,28 @@ class MemTableInserter : public WriteBatch::Handler {
// sequence number. Even if the update eventually fails and does not result // sequence number. Even if the update eventually fails and does not result
// in memtable add/update. // in memtable add/update.
sequence_++; sequence_++;
return Status::OK();
} }
virtual void Merge(const Slice& key, const Slice& value) { virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions();
bool perform_merge = false; bool perform_merge = false;
if (options_->max_successive_merges > 0 && db_ != nullptr) { if (options->max_successive_merges > 0 && db_ != nullptr) {
LookupKey lkey(key, sequence_); LookupKey lkey(key, sequence_);
// Count the number of successive merges at the head // Count the number of successive merges at the head
// of the key in the memtable // of the key in the memtable
size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
if (num_merges >= options_->max_successive_merges) { if (num_merges >= options->max_successive_merges) {
perform_merge = true; perform_merge = true;
} }
} }
@ -262,62 +393,78 @@ class MemTableInserter : public WriteBatch::Handler {
ReadOptions read_options; ReadOptions read_options;
read_options.snapshot = &read_from_snapshot; read_options.snapshot = &read_from_snapshot;
db_->Get(read_options, key, &get_value); auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
db_->Get(read_options, cf_handle, key, &get_value);
Slice get_value_slice = Slice(get_value); Slice get_value_slice = Slice(get_value);
// 2) Apply this merge // 2) Apply this merge
auto merge_operator = options_->merge_operator.get(); auto merge_operator = options->merge_operator.get();
assert(merge_operator); assert(merge_operator);
std::deque<std::string> operands; std::deque<std::string> operands;
operands.push_front(value.ToString()); operands.push_front(value.ToString());
std::string new_value; std::string new_value;
if (!merge_operator->FullMerge(key, if (!merge_operator->FullMerge(key, &get_value_slice, operands,
&get_value_slice, &new_value, options->info_log.get())) {
operands,
&new_value,
options_->info_log.get())) {
// Failed to merge! // Failed to merge!
RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES);
// Store the delta in memtable // Store the delta in memtable
perform_merge = false; perform_merge = false;
} else { } else {
// 3) Add value to memtable // 3) Add value to memtable
mem_->Add(sequence_, kTypeValue, key, new_value); mem->Add(sequence_, kTypeValue, key, new_value);
} }
} }
if (!perform_merge) { if (!perform_merge) {
// Add merge operator to memtable // Add merge operator to memtable
mem_->Add(sequence_, kTypeMerge, key, value); mem->Add(sequence_, kTypeMerge, key, value);
} }
sequence_++; sequence_++;
return Status::OK();
} }
virtual void Delete(const Slice& key) {
if (filter_deletes_) { virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions();
if (!dont_filter_deletes_ && options->filter_deletes) {
SnapshotImpl read_from_snapshot; SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_; read_from_snapshot.number_ = sequence_;
ReadOptions ropts; ReadOptions ropts;
ropts.snapshot = &read_from_snapshot; ropts.snapshot = &read_from_snapshot;
std::string value; std::string value;
if (!db_->KeyMayExist(ropts, key, &value)) { auto cf_handle = cf_mems_->GetColumnFamilyHandle();
RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES); if (cf_handle == nullptr) {
return; cf_handle = db_->DefaultColumnFamily();
}
if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES);
return Status::OK();
} }
} }
mem_->Add(sequence_, kTypeDeletion, key, Slice()); mem->Add(sequence_, kTypeDeletion, key, Slice());
sequence_++; sequence_++;
return Status::OK();
} }
}; };
} // namespace } // namespace
Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem, Status WriteBatchInternal::InsertInto(const WriteBatch* b,
const Options* opts, DB* db, ColumnFamilyMemTables* memtables,
const bool filter_deletes) { bool recovery, uint64_t log_number,
MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db, DB* db, const bool dont_filter_deletes) {
filter_deletes); MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
recovery, log_number, db, dont_filter_deletes);
return b->Iterate(&inserter); return b->Iterate(&inserter);
} }

@ -17,6 +17,49 @@ namespace rocksdb {
class MemTable; class MemTable;
class ColumnFamilyMemTables {
public:
virtual ~ColumnFamilyMemTables() {}
virtual bool Seek(uint32_t column_family_id) = 0;
// returns true if the update to memtable should be ignored
// (useful when recovering from log whose updates have already
// been processed)
virtual uint64_t GetLogNumber() const = 0;
virtual MemTable* GetMemTable() const = 0;
virtual const Options* GetOptions() const = 0;
virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
};
class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
public:
ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options)
: ok_(false), mem_(mem), options_(options) {}
bool Seek(uint32_t column_family_id) override {
ok_ = (column_family_id == 0);
return ok_;
}
uint64_t GetLogNumber() const override { return 0; }
MemTable* GetMemTable() const override {
assert(ok_);
return mem_;
}
const Options* GetOptions() const override {
assert(ok_);
return options_;
}
ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
private:
bool ok_;
MemTable* mem_;
const Options* const options_;
};
// WriteBatchInternal provides static methods for manipulating a // WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface. // WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal { class WriteBatchInternal {
@ -45,11 +88,21 @@ class WriteBatchInternal {
static void SetContents(WriteBatch* batch, const Slice& contents); static void SetContents(WriteBatch* batch, const Slice& contents);
// Inserts batch entries into memtable // Inserts batch entries into memtable
// Drops deletes in batch if filter_del is set to true and // If dont_filter_deletes is false AND options.filter_deletes is true,
// db->KeyMayExist returns false // then --> Drops deletes in batch if db->KeyMayExist returns false
static Status InsertInto(const WriteBatch* batch, MemTable* memtable, // If recovery == true, this means InsertInto is executed on a recovery
const Options* opts, DB* db = nullptr, // code-path. WriteBatch referencing a dropped column family can be
const bool filter_del = false); // found on a recovery code-path and should be ignored (recovery should not
// fail). Additionally, the memtable will be updated only if
// memtables->GetLogNumber() >= log_number
// However, if recovery == false, any WriteBatch referencing
// non-existing column family will return a failure. Also, log_number is
// ignored in that case
static Status InsertInto(const WriteBatch* batch,
ColumnFamilyMemTables* memtables,
bool recovery = false, uint64_t log_number = 0,
DB* db = nullptr,
const bool dont_filter_deletes = true);
static void Append(WriteBatch* dst, const WriteBatch* src); static void Append(WriteBatch* dst, const WriteBatch* src);
}; };

@ -11,6 +11,7 @@
#include <memory> #include <memory>
#include "db/memtable.h" #include "db/memtable.h"
#include "db/column_family.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
@ -27,7 +28,8 @@ static std::string PrintContents(WriteBatch* b) {
MemTable* mem = new MemTable(cmp, options); MemTable* mem = new MemTable(cmp, options);
mem->Ref(); mem->Ref();
std::string state; std::string state;
Status s = WriteBatchInternal::InsertInto(b, mem, &options); ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default);
int count = 0; int count = 0;
Iterator* iter = mem->NewIterator(); Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
@ -144,17 +146,37 @@ TEST(WriteBatchTest, Append) {
namespace { namespace {
struct TestHandler : public WriteBatch::Handler { struct TestHandler : public WriteBatch::Handler {
std::string seen; std::string seen;
virtual void Put(const Slice& key, const Slice& value) { virtual Status PutCF(uint32_t column_family_id, const Slice& key,
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; const Slice& value) {
if (column_family_id == 0) {
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
} else {
seen += "PutCF(" + std::to_string(column_family_id) + ", " +
key.ToString() + ", " + value.ToString() + ")";
}
return Status::OK();
} }
virtual void Merge(const Slice& key, const Slice& value) { virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; const Slice& value) {
if (column_family_id == 0) {
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
} else {
seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
key.ToString() + ", " + value.ToString() + ")";
}
return Status::OK();
} }
virtual void LogData(const Slice& blob) { virtual void LogData(const Slice& blob) {
seen += "LogData(" + blob.ToString() + ")"; seen += "LogData(" + blob.ToString() + ")";
} }
virtual void Delete(const Slice& key) { virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
seen += "Delete(" + key.ToString() + ")"; if (column_family_id == 0) {
seen += "Delete(" + key.ToString() + ")";
} else {
seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
key.ToString() + ")";
}
return Status::OK();
} }
}; };
} }
@ -194,21 +216,23 @@ TEST(WriteBatchTest, Continue) {
struct Handler : public TestHandler { struct Handler : public TestHandler {
int num_seen = 0; int num_seen = 0;
virtual void Put(const Slice& key, const Slice& value) { virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
++num_seen; ++num_seen;
TestHandler::Put(key, value); return TestHandler::PutCF(column_family_id, key, value);
} }
virtual void Merge(const Slice& key, const Slice& value) { virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
++num_seen; ++num_seen;
TestHandler::Merge(key, value); return TestHandler::MergeCF(column_family_id, key, value);
} }
virtual void LogData(const Slice& blob) { virtual void LogData(const Slice& blob) {
++num_seen; ++num_seen;
TestHandler::LogData(blob); TestHandler::LogData(blob);
} }
virtual void Delete(const Slice& key) { virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
++num_seen; ++num_seen;
TestHandler::Delete(key); return TestHandler::DeleteCF(column_family_id, key);
} }
virtual bool Continue() override { virtual bool Continue() override {
return num_seen < 3; return num_seen < 3;
@ -256,6 +280,42 @@ TEST(WriteBatchTest, PutGatherSlices) {
ASSERT_EQ(3, batch.Count()); ASSERT_EQ(3, batch.Count());
} }
namespace {
class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
public:
ColumnFamilyHandleImplDummy(int id)
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
uint32_t GetID() const override { return id_; }
private:
uint32_t id_;
};
} // namespace anonymous
TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
WriteBatch batch;
ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
batch.Put(&zero, Slice("foo"), Slice("bar"));
batch.Put(&two, Slice("twofoo"), Slice("bar2"));
batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
batch.Delete(&eight, Slice("eightfoo"));
batch.Merge(&three, Slice("threethree"), Slice("3three"));
batch.Put(&zero, Slice("foo"), Slice("bar"));
batch.Merge(Slice("omom"), Slice("nom"));
TestHandler handler;
batch.Iterate(&handler);
ASSERT_EQ(
"Put(foo, bar)"
"PutCF(2, twofoo, bar2)"
"PutCF(8, eightfoo, bar8)"
"DeleteCF(8, eightfoo)"
"MergeCF(3, threethree, 3three)"
"Put(foo, bar)"
"Merge(omom, nom)",
handler.seen);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -243,6 +243,7 @@ extern void rocksdb_options_set_paranoid_checks(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
@ -275,6 +276,8 @@ extern void rocksdb_options_set_expanded_compaction_factor(
rocksdb_options_t*, int); rocksdb_options_t*, int);
extern void rocksdb_options_set_max_grandparent_overlap_factor( extern void rocksdb_options_set_max_grandparent_overlap_factor(
rocksdb_options_t*, int); rocksdb_options_t*, int);
extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
rocksdb_options_t*, int* level_values, size_t num_levels);
extern void rocksdb_options_enable_statistics(rocksdb_options_t*); extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int); extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
@ -330,10 +333,14 @@ extern void rocksdb_options_set_block_size_deviation(
rocksdb_options_t*, int); rocksdb_options_t*, int);
extern void rocksdb_options_set_advise_random_on_open( extern void rocksdb_options_set_advise_random_on_open(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_access_hint_on_compaction_start(
rocksdb_options_t*, int);
extern void rocksdb_options_set_use_adaptive_mutex( extern void rocksdb_options_set_use_adaptive_mutex(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_bytes_per_sync( extern void rocksdb_options_set_bytes_per_sync(
rocksdb_options_t*, uint64_t); rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_verify_checksums_in_compaction(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_filter_deletes( extern void rocksdb_options_set_filter_deletes(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_max_sequential_skip_in_iterations( extern void rocksdb_options_set_max_sequential_skip_in_iterations(
@ -348,6 +355,7 @@ extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*); extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t); extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t); extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n); extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec); extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
@ -360,6 +368,16 @@ extern void rocksdb_options_set_memtable_prefix_bloom_probes(
rocksdb_options_t*, uint32_t); rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_max_successive_merges( extern void rocksdb_options_set_max_successive_merges(
rocksdb_options_t*, size_t); rocksdb_options_t*, size_t);
extern void rocksdb_options_set_min_partial_merge_operands(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_bloom_locality(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_allow_thread_local(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_inplace_update_support(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_inplace_update_num_locks(
rocksdb_options_t*, size_t);
enum { enum {
rocksdb_no_compression = 0, rocksdb_no_compression = 0,

@ -13,6 +13,7 @@
#include <stdio.h> #include <stdio.h>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <string>
#include <unordered_map> #include <unordered_map>
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
@ -23,8 +24,24 @@ namespace rocksdb {
using std::unique_ptr; using std::unique_ptr;
class ColumnFamilyHandle {
public:
virtual ~ColumnFamilyHandle() {}
};
extern const std::string default_column_family_name;
struct ColumnFamilyDescriptor {
std::string name;
ColumnFamilyOptions options;
ColumnFamilyDescriptor()
: name(default_column_family_name), options(ColumnFamilyOptions()) {}
ColumnFamilyDescriptor(const std::string& name,
const ColumnFamilyOptions& options)
: name(name), options(options) {}
};
// Update Makefile if you change these // Update Makefile if you change these
static const int kMajorVersion = 2; static const int kMajorVersion = 3;
static const int kMinorVersion = 0; static const int kMinorVersion = 0;
struct Options; struct Options;
@ -87,33 +104,80 @@ class DB {
// that modify data, like put/delete, will return error. // that modify data, like put/delete, will return error.
// If the db is opened in read only mode, then no compactions // If the db is opened in read only mode, then no compactions
// will happen. // will happen.
// TODO(icanadi): implement OpenForReadOnly that specifies column families.
// User can open DB in read-only mode even if not specifying all column
// families
static Status OpenForReadOnly(const Options& options, static Status OpenForReadOnly(const Options& options,
const std::string& name, DB** dbptr, const std::string& name, DB** dbptr,
bool error_if_log_file_exist = false); bool error_if_log_file_exist = false);
// Open DB with column families.
// db_options specify database specific options
// column_families is the vector of all column families you'd like to open,
// containing column family name and options. The default column family name
// is 'default'.
// If everything is OK, handles will on return be the same size
// as column_families --- handles[i] will be a handle that you
// will use to operate on column family column_family[i]
static Status Open(const DBOptions& db_options, const std::string& name,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
// ListColumnFamilies will open the DB specified by argument name
// and return the list of all column families in that DB
// through column_families argument. The ordering of
// column families in column_families is unspecified.
static Status ListColumnFamilies(const DBOptions& db_options,
const std::string& name,
std::vector<std::string>* column_families);
DB() { } DB() { }
virtual ~DB(); virtual ~DB();
// Create a column_family and return the handle of column family
// through the argument handle.
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name,
ColumnFamilyHandle** handle);
// Drop a column family specified by column_family handle. This call
// only records a drop record in the manifest and prevents the column
// family from flushing and compacting.
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
// Set the database entry for "key" to "value". // Set the database entry for "key" to "value".
// Returns OK on success, and a non-OK status on error. // Returns OK on success, and a non-OK status on error.
// Note: consider setting options.sync = true. // Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options, virtual Status Put(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) = 0; const Slice& value) = 0;
Status Put(const WriteOptions& options, const Slice& key,
const Slice& value) {
return Put(options, DefaultColumnFamily(), key, value);
}
// Remove the database entry (if any) for "key". Returns OK on // Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key" // success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database. // did not exist in the database.
// Note: consider setting options.sync = true. // Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family,
const Slice& key) = 0;
Status Delete(const WriteOptions& options, const Slice& key) {
return Delete(options, DefaultColumnFamily(), key);
}
// Merge the database entry for "key" with "value". Returns OK on success, // Merge the database entry for "key" with "value". Returns OK on success,
// and a non-OK status on error. The semantics of this operation is // and a non-OK status on error. The semantics of this operation is
// determined by the user provided merge_operator when opening DB. // determined by the user provided merge_operator when opening DB.
// Note: consider setting options.sync = true. // Note: consider setting options.sync = true.
virtual Status Merge(const WriteOptions& options, virtual Status Merge(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) = 0; const Slice& value) = 0;
Status Merge(const WriteOptions& options, const Slice& key,
const Slice& value) {
return Merge(options, DefaultColumnFamily(), key, value);
}
// Apply the specified updates to the database. // Apply the specified updates to the database.
// Returns OK on success, non-OK on failure. // Returns OK on success, non-OK on failure.
@ -128,8 +192,11 @@ class DB {
// //
// May return some other Status on an error. // May return some other Status on an error.
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) = 0; std::string* value) = 0;
Status Get(const ReadOptions& options, const Slice& key, std::string* value) {
return Get(options, DefaultColumnFamily(), key, value);
}
// If keys[i] does not exist in the database, then the i'th returned // If keys[i] does not exist in the database, then the i'th returned
// status will be one for which Status::IsNotFound() is true, and // status will be one for which Status::IsNotFound() is true, and
@ -141,9 +208,17 @@ class DB {
// Similarly, the number of returned statuses will be the number of keys. // Similarly, the number of returned statuses will be the number of keys.
// Note: keys will not be "de-duplicated". Duplicate keys will return // Note: keys will not be "de-duplicated". Duplicate keys will return
// duplicate values in order. // duplicate values in order.
virtual std::vector<Status> MultiGet(const ReadOptions& options, virtual std::vector<Status> MultiGet(
const std::vector<Slice>& keys, const ReadOptions& options,
std::vector<std::string>* values) = 0; const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values) {
return MultiGet(options, std::vector<ColumnFamilyHandle*>(
keys.size(), DefaultColumnFamily()),
keys, values);
}
// If the key definitely does not exist in the database, then this method // If the key definitely does not exist in the database, then this method
// returns false, else true. If the caller wants to obtain value when the key // returns false, else true. If the caller wants to obtain value when the key
@ -153,14 +228,17 @@ class DB {
// to make this lighter weight is to avoid doing any IOs. // to make this lighter weight is to avoid doing any IOs.
// Default implementation here returns true and sets 'value_found' to false // Default implementation here returns true and sets 'value_found' to false
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value, bool* value_found = nullptr) {
bool* value_found = nullptr) {
if (value_found != nullptr) { if (value_found != nullptr) {
*value_found = false; *value_found = false;
} }
return true; return true;
} }
bool KeyMayExist(const ReadOptions& options, const Slice& key,
std::string* value, bool* value_found = nullptr) {
return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
}
// Return a heap-allocated iterator over the contents of the database. // Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must // The result of NewIterator() is initially invalid (caller must
@ -168,7 +246,18 @@ class DB {
// //
// Caller should delete the iterator when it is no longer needed. // Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted. // The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0; virtual Iterator* NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family) = 0;
Iterator* NewIterator(const ReadOptions& options) {
return NewIterator(options, DefaultColumnFamily());
}
// Returns iterators from a consistent database state across multiple
// column families. Iterators are heap allocated and need to be deleted
// before the db is deleted
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) = 0;
// Return a handle to the current DB state. Iterators created with // Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB // this handle will all observe a stable snapshot of the current DB
@ -194,7 +283,11 @@ class DB {
// about the internal operation of the DB. // about the internal operation of the DB.
// "rocksdb.sstables" - returns a multi-line string that describes all // "rocksdb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents. // of the sstables that make up the db contents.
virtual bool GetProperty(const Slice& property, std::string* value) = 0; virtual bool GetProperty(ColumnFamilyHandle* column_family,
const Slice& property, std::string* value) = 0;
bool GetProperty(const Slice& property, std::string* value) {
return GetProperty(DefaultColumnFamily(), property, value);
}
// For each i in [0,n-1], store in "sizes[i]", the approximate // For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)". // file system space used by keys in "[range[i].start .. range[i].limit)".
@ -204,8 +297,12 @@ class DB {
// sizes will be one-tenth the size of the corresponding user data size. // sizes will be one-tenth the size of the corresponding user data size.
// //
// The results may not include the sizes of recently written data. // The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(const Range* range, int n, virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n,
uint64_t* sizes) = 0; uint64_t* sizes) = 0;
void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
}
// Compact the underlying storage for the key range [*begin,*end]. // Compact the underlying storage for the key range [*begin,*end].
// The actual compaction interval might be superset of [*begin, *end]. // The actual compaction interval might be superset of [*begin, *end].
@ -224,19 +321,32 @@ class DB {
// hosting all the files. In this case, client could set reduce_level // hosting all the files. In this case, client could set reduce_level
// to true, to move the files back to the minimum level capable of holding // to true, to move the files back to the minimum level capable of holding
// the data set or a given level (specified by non-negative target_level). // the data set or a given level (specified by non-negative target_level).
virtual Status CompactRange(const Slice* begin, const Slice* end, virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, bool reduce_level = false,
int target_level = -1) = 0; int target_level = -1) = 0;
Status CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1) {
return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
target_level);
}
// Number of levels used for this DB. // Number of levels used for this DB.
virtual int NumberLevels() = 0; virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
// Maximum level to which a new compacted memtable is pushed if it // Maximum level to which a new compacted memtable is pushed if it
// does not create overlap. // does not create overlap.
virtual int MaxMemCompactionLevel() = 0; virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
int MaxMemCompactionLevel() {
return MaxMemCompactionLevel(DefaultColumnFamily());
}
// Number of files in level-0 that would stop writes. // Number of files in level-0 that would stop writes.
virtual int Level0StopWriteTrigger() = 0; virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
int Level0StopWriteTrigger() {
return Level0StopWriteTrigger(DefaultColumnFamily());
}
// Get DB name -- the exact same name that was provided as an argument to // Get DB name -- the exact same name that was provided as an argument to
// DB::Open() // DB::Open()
@ -246,10 +356,18 @@ class DB {
virtual Env* GetEnv() const = 0; virtual Env* GetEnv() const = 0;
// Get DB Options that we use // Get DB Options that we use
virtual const Options& GetOptions() const = 0; virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
const = 0;
const Options& GetOptions() const {
return GetOptions(DefaultColumnFamily());
}
// Flush all mem-table data. // Flush all mem-table data.
virtual Status Flush(const FlushOptions& options) = 0; virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) = 0;
Status Flush(const FlushOptions& options) {
return Flush(options, DefaultColumnFamily());
}
// Prevent file deletions. Compactions will continue to occur, // Prevent file deletions. Compactions will continue to occur,
// but no obsolete files will be deleted. Calling this multiple // but no obsolete files will be deleted. Calling this multiple
@ -279,9 +397,12 @@ class DB {
// Setting flush_memtable to true does Flush before recording the live files. // Setting flush_memtable to true does Flush before recording the live files.
// Setting flush_memtable to false is useful when we don't want to wait for // Setting flush_memtable to false is useful when we don't want to wait for
// flush which may have to wait for compaction to complete taking an // flush which may have to wait for compaction to complete taking an
// indeterminate time. But this will have to use GetSortedWalFiles after // indeterminate time.
// GetLiveFiles to compensate for memtables missed in this snapshot due to the //
// absence of Flush, by WAL files to recover the database consistently later // In case you have multiple column families, even if flush_memtable is true,
// you still need to call GetSortedWalFiles after GetLiveFiles to compensate
// for new data that arrived to already-flushed column families while other
// column families were flushing
virtual Status GetLiveFiles(std::vector<std::string>&, virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size, uint64_t* manifest_file_size,
bool flush_memtable = true) = 0; bool flush_memtable = true) = 0;
@ -319,7 +440,14 @@ class DB {
// be set properly // be set properly
virtual Status GetDbIdentity(std::string& identity) = 0; virtual Status GetDbIdentity(std::string& identity) = 0;
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) = 0; // Returns default column family handle
virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props) = 0;
Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
}
private: private:
// No copying allowed // No copying allowed

@ -34,7 +34,7 @@ class Slice;
class WritableFile; class WritableFile;
class RandomRWFile; class RandomRWFile;
class Directory; class Directory;
struct Options; struct DBOptions;
using std::unique_ptr; using std::unique_ptr;
using std::shared_ptr; using std::shared_ptr;
@ -47,7 +47,7 @@ struct EnvOptions {
EnvOptions(); EnvOptions();
// construct from Options // construct from Options
explicit EnvOptions(const Options& options); explicit EnvOptions(const DBOptions& options);
// If true, then allow caching of data in environment buffers // If true, then allow caching of data in environment buffers
bool use_os_buffer = true; bool use_os_buffer = true;

@ -45,6 +45,8 @@ class LookupKey;
class Slice; class Slice;
class SliceTransform; class SliceTransform;
typedef void* KeyHandle;
class MemTableRep { class MemTableRep {
public: public:
// KeyComparator provides a means to compare keys, which are internal keys // KeyComparator provides a means to compare keys, which are internal keys
@ -62,11 +64,19 @@ class MemTableRep {
virtual ~KeyComparator() { } virtual ~KeyComparator() { }
}; };
explicit MemTableRep(Arena* arena) : arena_(arena) {}
// Allocate a buf of len size for storing key. The idea is that a specific
// memtable representation knows its underlying data structure better. By
// allowing it to allocate memory, it can possibly put correlated stuff
// in consecutive memory area to make processor prefetching more efficient.
virtual KeyHandle Allocate(const size_t len, char** buf);
// Insert key into the collection. (The caller will pack key and value into a // Insert key into the collection. (The caller will pack key and value into a
// single buffer and pass that in as the parameter to Insert) // single buffer and pass that in as the parameter to Insert).
// REQUIRES: nothing that compares equal to key is currently in the // REQUIRES: nothing that compares equal to key is currently in the
// collection. // collection.
virtual void Insert(const char* key) = 0; virtual void Insert(KeyHandle handle) = 0;
// Returns true iff an entry that compares equal to key is in the collection. // Returns true iff an entry that compares equal to key is in the collection.
virtual bool Contains(const char* key) const = 0; virtual bool Contains(const char* key) const = 0;
@ -153,6 +163,8 @@ class MemTableRep {
// When *key is an internal key concatenated with the value, returns the // When *key is an internal key concatenated with the value, returns the
// user key. // user key.
virtual Slice UserKey(const char* key) const; virtual Slice UserKey(const char* key) const;
Arena* arena_;
}; };
// This is the base class for all factories that are used by RocksDB to create // This is the base class for all factories that are used by RocksDB to create

@ -72,8 +72,9 @@ enum UpdateStatus { // Return status For inplace update callback
UPDATED = 2, // No inplace update. Merged value set UPDATED = 2, // No inplace update. Merged value set
}; };
// Options to control the behavior of a database (passed to DB::Open) struct Options;
struct Options {
struct ColumnFamilyOptions {
// ------------------- // -------------------
// Parameters that affect behavior // Parameters that affect behavior
@ -130,38 +131,6 @@ struct Options {
// Default: a factory that doesn't provide any object // Default: a factory that doesn't provide any object
std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2; std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
// If true, the database will be created if it is missing.
// Default: false
bool create_if_missing;
// If true, an error is raised if the database already exists.
// Default: false
bool error_if_exists;
// If true, the implementation will do aggressive checking of the
// data it is processing and will stop early if it detects any
// errors. This may have unforeseen ramifications: for example, a
// corruption of one DB entry may cause a large number of entries to
// become unreadable or for the entire DB to become unopenable.
// If any of the writes to the database fails (Put, Delete, Merge, Write),
// the database will switch to read-only mode and fail all other
// Write operations.
// Default: true
bool paranoid_checks;
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc.
// Default: Env::Default()
Env* env;
// Any internal progress/error information generated by the db will
// be written to info_log if it is non-nullptr, or to a file stored
// in the same directory as the DB contents if info_log is nullptr.
// Default: nullptr
shared_ptr<Logger> info_log;
InfoLogLevel info_log_level;
// ------------------- // -------------------
// Parameters that affect performance // Parameters that affect performance
@ -193,15 +162,6 @@ struct Options {
// individual write buffers. Default: 1 // individual write buffers. Default: 1
int min_write_buffer_number_to_merge; int min_write_buffer_number_to_merge;
// Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set. Value -1 means
// files opened are always kept open. You can estimate number of files based
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
//
// Default: 5000
int max_open_files;
// Control over blocks (user data is stored in a set of blocks, and // Control over blocks (user data is stored in a set of blocks, and
// a block is the unit of reading from disk). // a block is the unit of reading from disk).
@ -369,93 +329,12 @@ struct Options {
// stop building a single file in a level->level+1 compaction. // stop building a single file in a level->level+1 compaction.
int max_grandparent_overlap_factor; int max_grandparent_overlap_factor;
// If non-null, then we should collect metrics about database operations
// Statistics objects should not be shared between DB instances as
// it does not use any locks to prevent concurrent updates.
shared_ptr<Statistics> statistics;
// If true, then the contents of data files are not synced
// to stable storage. Their contents remain in the OS buffers till the
// OS decides to flush them. This option is good for bulk-loading
// of data. Once the bulk-loading is complete, please issue a
// sync to the OS to flush all dirty buffesrs to stable storage.
// Default: false
bool disableDataSync;
// If true, then every store to stable storage will issue a fsync.
// If false, then every store to stable storage will issue a fdatasync.
// This parameter should be set to true while storing data to
// filesystem like ext3 that can lose files after a reboot.
// Default: false
bool use_fsync;
// This number controls how often a new scribe log about
// db deploy stats is written out.
// -1 indicates no logging at all.
// Default value is 1800 (half an hour).
int db_stats_log_interval;
// This specifies the info LOG dir.
// If it is empty, the log files will be in the same dir as data.
// If it is non empty, the log files will be in the specified dir,
// and the db data dir's absolute path will be used as the log file
// name's prefix.
std::string db_log_dir;
// This specifies the absolute dir path for write-ahead logs (WAL).
// If it is empty, the log files will be in the same dir as data,
// dbname is used as the data dir by default
// If it is non empty, the log files will be in kept the specified dir.
// When destroying the db,
// all log files in wal_dir and the dir itself is deleted
std::string wal_dir;
// Disable compaction triggered by seek. // Disable compaction triggered by seek.
// With bloomfilter and fast storage, a miss on one level // With bloomfilter and fast storage, a miss on one level
// is very cheap if the file handle is cached in table cache // is very cheap if the file handle is cached in table cache
// (which is true if max_open_files is large). // (which is true if max_open_files is large).
bool disable_seek_compaction; bool disable_seek_compaction;
// The periodicity when obsolete files get deleted. The default
// value is 6 hours. The files that get out of scope by compaction
// process will still get automatically delete on every compaction,
// regardless of this setting
uint64_t delete_obsolete_files_period_micros;
// Maximum number of concurrent background jobs, submitted to
// the default LOW priority thread pool
// Default: 1
int max_background_compactions;
// Maximum number of concurrent background memtable flush jobs, submitted to
// the HIGH priority thread pool.
// By default, all background jobs (major compaction and memtable flush) go
// to the LOW priority pool. If this option is set to a positive number,
// memtable flush jobs will be submitted to the HIGH priority pool.
// It is important when the same Env is shared by multiple db instances.
// Without a separate pool, long running major compaction jobs could
// potentially block memtable flush jobs of other db instances, leading to
// unnecessary Put stalls.
// Default: 1
int max_background_flushes;
// Specify the maximal size of the info log file. If the log file
// is larger than `max_log_file_size`, a new info log file will
// be created.
// If max_log_file_size == 0, all logs will be written to one
// log file.
size_t max_log_file_size;
// Time for the info log file to roll (in seconds).
// If specified with non-zero value, log file will be rolled
// if it has been active longer than `log_file_time_to_roll`.
// Default: 0 (disabled)
size_t log_file_time_to_roll;
// Maximal info log files to be kept.
// Default: 1000
size_t keep_log_file_num;
// Puts are delayed 0-1 ms when any level has a compaction score that exceeds // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
// soft_rate_limit. This is ignored when == 0.0. // soft_rate_limit. This is ignored when == 0.0.
// CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
@ -473,32 +352,14 @@ struct Options {
// Default: 1000 // Default: 1000
unsigned int rate_limit_delay_max_milliseconds; unsigned int rate_limit_delay_max_milliseconds;
// manifest file is rolled over on reaching this limit.
// The older manifest file be deleted.
// The default value is MAX_INT so that roll-over does not take place.
uint64_t max_manifest_file_size;
// Disable block cache. If this is set to true, // Disable block cache. If this is set to true,
// then no block cache should be used, and the block_cache should // then no block cache should be used, and the block_cache should
// point to a nullptr object. // point to a nullptr object.
// Default: false // Default: false
bool no_block_cache; bool no_block_cache;
// Number of shards used for table cache. // size of one block in arena memory allocation.
int table_cache_numshardbits; // If <= 0, a proper value is automatically calculated (usually 1/10 of
// During data eviction of table's LRU cache, it would be inefficient
// to strictly follow LRU because this piece of memory will not really
// be released unless its refcount falls to zero. Instead, make two
// passes: the first pass will release items with refcount = 1,
// and if not enough space releases after scanning the number of
// elements specified by this parameter, we will remove items in LRU
// order.
int table_cache_remove_scan_count_limit;
// Size of one block in arena memory allocation.
//
// If <= 0, a proper value is automatically calculated (usually about 1/10 of
// writer_buffer_size). // writer_buffer_size).
// //
// There are two additonal restriction of the The specified size: // There are two additonal restriction of the The specified size:
@ -512,71 +373,14 @@ struct Options {
// Default: 0 // Default: 0
size_t arena_block_size; size_t arena_block_size;
// Create an Options object with default values for all fields.
Options();
void Dump(Logger* log) const;
// Set appropriate parameters for bulk loading.
// The reason that this is a function that returns "this" instead of a
// constructor is to enable chaining of multiple similar calls in the future.
//
// All data will be in level 0 without any automatic compaction.
// It's recommended to manually call CompactRange(NULL, NULL) before reading
// from the database, because otherwise the read can be very slow.
Options* PrepareForBulkLoad();
// Disable automatic compactions. Manual compactions can still // Disable automatic compactions. Manual compactions can still
// be issued on this database. // be issued on this column family
bool disable_auto_compactions; bool disable_auto_compactions;
// The following two fields affect how archived logs will be deleted.
// 1. If both set to 0, logs will be deleted asap and will not get into
// the archive.
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
// WAL files will be checked every 10 min and if total size is greater
// then WAL_size_limit_MB, they will be deleted starting with the
// earliest until size_limit is met. All empty files will be deleted.
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
// WAL files will be checked every WAL_ttl_secondsi / 2 and those that
// are older than WAL_ttl_seconds will be deleted.
// 4. If both are not 0, WAL files will be checked every 10 min and both
// checks will be performed with ttl being first.
uint64_t WAL_ttl_seconds;
uint64_t WAL_size_limit_MB;
// Number of bytes to preallocate (via fallocate) the manifest
// files. Default is 4mb, which is reasonable to reduce random IO
// as well as prevent overallocation for mounts that preallocate
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size;
// Purge duplicate/deleted keys when a memtable is flushed to storage. // Purge duplicate/deleted keys when a memtable is flushed to storage.
// Default: true // Default: true
bool purge_redundant_kvs_while_flush; bool purge_redundant_kvs_while_flush;
// Data being read from file storage may be buffered in the OS
// Default: true
bool allow_os_buffer;
// Allow the OS to mmap file for reading sst tables. Default: false
bool allow_mmap_reads;
// Allow the OS to mmap file for writing. Default: false
bool allow_mmap_writes;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec;
// Skip log corruption error on recovery (If client is ok with
// losing most recent changes)
// Default: false
bool skip_log_error_on_recovery;
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
// Default: 3600 (1 hour)
unsigned int stats_dump_period_sec;
// This is used to close a block before it reaches the configured // This is used to close a block before it reaches the configured
// 'block_size'. If the percentage of free space in the current block is less // 'block_size'. If the percentage of free space in the current block is less
// than this specified number and adding a new record to the block will // than this specified number and adding a new record to the block will
@ -585,45 +389,17 @@ struct Options {
// Default is 10. // Default is 10.
int block_size_deviation; int block_size_deviation;
// If set true, will hint the underlying file system that the file
// access pattern is random, when a sst file is opened.
// Default: true
bool advise_random_on_open;
// Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction.
// Default: NORMAL
enum {
NONE,
NORMAL,
SEQUENTIAL,
WILLNEED
} access_hint_on_compaction_start;
// Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not
// heavily contended. However, if the mutex is hot, we could end up
// wasting spin time.
// Default: false
bool use_adaptive_mutex;
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background.
// Issue one request for every bytes_per_sync written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync;
// The compaction style. Default: kCompactionStyleLevel // The compaction style. Default: kCompactionStyleLevel
CompactionStyle compaction_style; CompactionStyle compaction_style;
// The options needed to support Universal Style compactions
CompactionOptionsUniversal compaction_options_universal;
// If true, compaction will verify checksum on every read that happens // If true, compaction will verify checksum on every read that happens
// as part of compaction // as part of compaction
// Default: true // Default: true
bool verify_checksums_in_compaction; bool verify_checksums_in_compaction;
// The options needed to support Universal Style compactions
CompactionOptionsUniversal compaction_options_universal;
// Use KeyMayExist API to filter deletes when this is true. // Use KeyMayExist API to filter deletes when this is true.
// If KeyMayExist returns false, i.e. the key definitely does not exist, then // If KeyMayExist returns false, i.e. the key definitely does not exist, then
// the delete is a noop. KeyMayExist only incurs in-memory look up. // the delete is a noop. KeyMayExist only incurs in-memory look up.
@ -653,7 +429,7 @@ struct Options {
// Default: emtpy vector -- no user-defined statistics collection will be // Default: emtpy vector -- no user-defined statistics collection will be
// performed. // performed.
typedef std::vector<std::shared_ptr<TablePropertiesCollector>> typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
TablePropertiesCollectors; TablePropertiesCollectors;
TablePropertiesCollectors table_properties_collectors; TablePropertiesCollectors table_properties_collectors;
// Allows thread-safe inplace updates. // Allows thread-safe inplace updates.
@ -750,9 +526,266 @@ struct Options {
// Default: 2 // Default: 2
uint32_t min_partial_merge_operands; uint32_t min_partial_merge_operands;
// Create ColumnFamilyOptions with default values for all fields
ColumnFamilyOptions();
// Create ColumnFamilyOptions from Options
explicit ColumnFamilyOptions(const Options& options);
void Dump(Logger* log) const;
};
struct DBOptions {
// If true, the database will be created if it is missing.
// Default: false
bool create_if_missing;
// If true, an error is raised if the database already exists.
// Default: false
bool error_if_exists;
// If true, the implementation will do aggressive checking of the
// data it is processing and will stop early if it detects any
// errors. This may have unforeseen ramifications: for example, a
// corruption of one DB entry may cause a large number of entries to
// become unreadable or for the entire DB to become unopenable.
// If any of the writes to the database fails (Put, Delete, Merge, Write),
// the database will switch to read-only mode and fail all other
// Write operations.
// Default: true
bool paranoid_checks;
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc.
// Default: Env::Default()
Env* env;
// Any internal progress/error information generated by the db will
// be written to info_log if it is non-nullptr, or to a file stored
// in the same directory as the DB contents if info_log is nullptr.
// Default: nullptr
shared_ptr<Logger> info_log;
InfoLogLevel info_log_level;
// Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set. Value -1 means
// files opened are always kept open. You can estimate number of files based
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
// Default: 5000
int max_open_files;
// If non-null, then we should collect metrics about database operations
// Statistics objects should not be shared between DB instances as
// it does not use any locks to prevent concurrent updates.
shared_ptr<Statistics> statistics;
// If true, then the contents of data files are not synced
// to stable storage. Their contents remain in the OS buffers till the
// OS decides to flush them. This option is good for bulk-loading
// of data. Once the bulk-loading is complete, please issue a
// sync to the OS to flush all dirty buffesrs to stable storage.
// Default: false
bool disableDataSync;
// If true, then every store to stable storage will issue a fsync.
// If false, then every store to stable storage will issue a fdatasync.
// This parameter should be set to true while storing data to
// filesystem like ext3 that can lose files after a reboot.
// Default: false
bool use_fsync;
// This number controls how often a new scribe log about
// db deploy stats is written out.
// -1 indicates no logging at all.
// Default value is 1800 (half an hour).
int db_stats_log_interval;
// This specifies the info LOG dir.
// If it is empty, the log files will be in the same dir as data.
// If it is non empty, the log files will be in the specified dir,
// and the db data dir's absolute path will be used as the log file
// name's prefix.
std::string db_log_dir;
// This specifies the absolute dir path for write-ahead logs (WAL).
// If it is empty, the log files will be in the same dir as data,
// dbname is used as the data dir by default
// If it is non empty, the log files will be in kept the specified dir.
// When destroying the db,
// all log files in wal_dir and the dir itself is deleted
std::string wal_dir;
// The periodicity when obsolete files get deleted. The default
// value is 6 hours. The files that get out of scope by compaction
// process will still get automatically delete on every compaction,
// regardless of this setting
uint64_t delete_obsolete_files_period_micros;
// Maximum number of concurrent background compaction jobs, submitted to
// the default LOW priority thread pool.
// If you're increasing this, also consider increasing number of threads in
// LOW priority thread pool. For more information, see
// Env::SetBackgroundThreads
// Default: 1
int max_background_compactions;
// Maximum number of concurrent background memtable flush jobs, submitted to
// the HIGH priority thread pool.
//
// By default, all background jobs (major compaction and memtable flush) go
// to the LOW priority pool. If this option is set to a positive number,
// memtable flush jobs will be submitted to the HIGH priority pool.
// It is important when the same Env is shared by multiple db instances.
// Without a separate pool, long running major compaction jobs could
// potentially block memtable flush jobs of other db instances, leading to
// unnecessary Put stalls.
//
// If you're increasing this, also consider increasing number of threads in
// HIGH priority thread pool. For more information, see
// Env::SetBackgroundThreads
// Default: 1
int max_background_flushes;
// Specify the maximal size of the info log file. If the log file
// is larger than `max_log_file_size`, a new info log file will
// be created.
// If max_log_file_size == 0, all logs will be written to one
// log file.
size_t max_log_file_size;
// Time for the info log file to roll (in seconds).
// If specified with non-zero value, log file will be rolled
// if it has been active longer than `log_file_time_to_roll`.
// Default: 0 (disabled)
size_t log_file_time_to_roll;
// Maximal info log files to be kept.
// Default: 1000
size_t keep_log_file_num;
// manifest file is rolled over on reaching this limit.
// The older manifest file be deleted.
// The default value is MAX_INT so that roll-over does not take place.
uint64_t max_manifest_file_size;
// Number of shards used for table cache.
int table_cache_numshardbits;
// During data eviction of table's LRU cache, it would be inefficient
// to strictly follow LRU because this piece of memory will not really
// be released unless its refcount falls to zero. Instead, make two
// passes: the first pass will release items with refcount = 1,
// and if not enough space releases after scanning the number of
// elements specified by this parameter, we will remove items in LRU
// order.
int table_cache_remove_scan_count_limit;
// The following two fields affect how archived logs will be deleted.
// 1. If both set to 0, logs will be deleted asap and will not get into
// the archive.
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
// WAL files will be checked every 10 min and if total size is greater
// then WAL_size_limit_MB, they will be deleted starting with the
// earliest until size_limit is met. All empty files will be deleted.
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
// WAL files will be checked every WAL_ttl_secondsi / 2 and those that
// are older than WAL_ttl_seconds will be deleted.
// 4. If both are not 0, WAL files will be checked every 10 min and both
// checks will be performed with ttl being first.
uint64_t WAL_ttl_seconds;
uint64_t WAL_size_limit_MB;
// Number of bytes to preallocate (via fallocate) the manifest
// files. Default is 4mb, which is reasonable to reduce random IO
// as well as prevent overallocation for mounts that preallocate
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size;
// Data being read from file storage may be buffered in the OS
// Default: true
bool allow_os_buffer;
// Allow the OS to mmap file for reading sst tables. Default: false
bool allow_mmap_reads;
// Allow the OS to mmap file for writing. Default: false
bool allow_mmap_writes;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec;
// Skip log corruption error on recovery (If client is ok with
// losing most recent changes)
// Default: false
bool skip_log_error_on_recovery;
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
// Default: 3600 (1 hour)
unsigned int stats_dump_period_sec;
// If set true, will hint the underlying file system that the file
// access pattern is random, when a sst file is opened.
// Default: true
bool advise_random_on_open;
// Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction.
// Default: NORMAL
enum {
NONE,
NORMAL,
SEQUENTIAL,
WILLNEED
} access_hint_on_compaction_start;
// Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not
// heavily contended. However, if the mutex is hot, we could end up
// wasting spin time.
// Default: false
bool use_adaptive_mutex;
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background.
// Issue one request for every bytes_per_sync written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync;
// Allow RocksDB to use thread local storage to optimize performance. // Allow RocksDB to use thread local storage to optimize performance.
// Default: true // Default: true
bool allow_thread_local; bool allow_thread_local;
// Create DBOptions with default values for all fields
DBOptions();
// Create DBOptions from Options
explicit DBOptions(const Options& options);
void Dump(Logger* log) const;
};
// Options to control the behavior of a database (passed to DB::Open)
struct Options : public DBOptions, public ColumnFamilyOptions {
// Create an Options object with default values for all fields.
Options() :
DBOptions(),
ColumnFamilyOptions() {}
Options(const DBOptions& db_options,
const ColumnFamilyOptions& column_family_options)
: DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
void Dump(Logger* log) const;
// Set appropriate parameters for bulk loading.
// The reason that this is a function that returns "this" instead of a
// constructor is to enable chaining of multiple similar calls in the future.
//
// All data will be in level 0 without any automatic compaction.
// It's recommended to manually call CompactRange(NULL, NULL) before reading
// from the database, because otherwise the read can be very slow.
Options* PrepareForBulkLoad();
}; };
// //

@ -64,7 +64,11 @@ struct PerfContext {
uint64_t write_memtable_time; uint64_t write_memtable_time;
}; };
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
extern PerfContext perf_context;
#else
extern __thread PerfContext perf_context; extern __thread PerfContext perf_context;
#endif
} }

@ -31,6 +31,7 @@
namespace rocksdb { namespace rocksdb {
class Slice; class Slice;
class ColumnFamilyHandle;
struct SliceParts; struct SliceParts;
class WriteBatch { class WriteBatch {
@ -39,19 +40,34 @@ class WriteBatch {
~WriteBatch(); ~WriteBatch();
// Store the mapping "key->value" in the database. // Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value); void Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
void Put(const Slice& key, const Slice& value) {
Put(nullptr, key, value);
}
// Variant of Put() that gathers output like writev(2). The key and value // Variant of Put() that gathers output like writev(2). The key and value
// that will be written to the database are concatentations of arrays of // that will be written to the database are concatentations of arrays of
// slices. // slices.
void Put(const SliceParts& key, const SliceParts& value); void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value);
void Put(const SliceParts& key, const SliceParts& value) {
Put(nullptr, key, value);
}
// Merge "value" with the existing value of "key" in the database. // Merge "value" with the existing value of "key" in the database.
// "key->merge(existing, value)" // "key->merge(existing, value)"
void Merge(const Slice& key, const Slice& value); void Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
void Merge(const Slice& key, const Slice& value) {
Merge(nullptr, key, value);
}
// If the database contains a mapping for "key", erase it. Else do nothing. // If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key); void Delete(ColumnFamilyHandle* column_family, const Slice& key);
void Delete(const Slice& key) {
Delete(nullptr, key);
}
// Append a blob of arbitrary size to the records in this batch. The blob will // Append a blob of arbitrary size to the records in this batch. The blob will
// be stored in the transaction log but not in any other file. In particular, // be stored in the transaction log but not in any other file. In particular,
@ -72,14 +88,46 @@ class WriteBatch {
class Handler { class Handler {
public: public:
virtual ~Handler(); virtual ~Handler();
virtual void Put(const Slice& key, const Slice& value) = 0; // default implementation will just call Put without column family for
// backwards compatibility. If the column family is not default,
// the function is noop
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
if (column_family_id == 0) {
// Put() historically doesn't return status. We didn't want to be
// backwards incompatible so we didn't change the return status
// (this is a public API). We do an ordinary get and return Status::OK()
Put(key, value);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and PutCF not implemented");
}
virtual void Put(const Slice& key, const Slice& value);
// Merge and LogData are not pure virtual. Otherwise, we would break // Merge and LogData are not pure virtual. Otherwise, we would break
// existing clients of Handler on a source code level. The default // existing clients of Handler on a source code level. The default
// implementation of Merge simply throws a runtime exception. // implementation of Merge simply throws a runtime exception.
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
if (column_family_id == 0) {
Merge(key, value);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and MergeCF not implemented");
}
virtual void Merge(const Slice& key, const Slice& value); virtual void Merge(const Slice& key, const Slice& value);
// The default implementation of LogData does nothing. // The default implementation of LogData does nothing.
virtual void LogData(const Slice& blob); virtual void LogData(const Slice& blob);
virtual void Delete(const Slice& key) = 0; virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
if (column_family_id == 0) {
Delete(key);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and DeleteCF not implemented");
}
virtual void Delete(const Slice& key);
// Continue is called by WriteBatch::Iterate. If it returns false, // Continue is called by WriteBatch::Iterate. If it returns false,
// iteration is halted. Otherwise, it continues iterating. The default // iteration is halted. Otherwise, it continues iterating. The default
// implementation always returns true. // implementation always returns true.

@ -21,40 +21,49 @@ class StackableDB : public DB {
return db_; return db_;
} }
using DB::Put;
virtual Status Put(const WriteOptions& options, virtual Status Put(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) override { const Slice& val) override {
return db_->Put(options, key, val); return db_->Put(options, column_family, key, val);
} }
using DB::Get;
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) override { std::string* value) override {
return db_->Get(options, key, value); return db_->Get(options, column_family, key, value);
} }
virtual std::vector<Status> MultiGet(const ReadOptions& options, using DB::MultiGet;
const std::vector<Slice>& keys, virtual std::vector<Status> MultiGet(
std::vector<std::string>* values) const ReadOptions& options,
override { const std::vector<ColumnFamilyHandle*>& column_family,
return db_->MultiGet(options, keys, values); const std::vector<Slice>& keys,
std::vector<std::string>* values) override {
return db_->MultiGet(options, column_family, keys, values);
} }
using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value,
bool* value_found = nullptr) override { bool* value_found = nullptr) override {
return db_->KeyMayExist(options, key, value, value_found); return db_->KeyMayExist(options, column_family, key, value, value_found);
} }
virtual Status Delete(const WriteOptions& wopts, const Slice& key) override { using DB::Delete;
return db_->Delete(wopts, key); virtual Status Delete(const WriteOptions& wopts,
ColumnFamilyHandle* column_family,
const Slice& key) override {
return db_->Delete(wopts, column_family, key);
} }
using DB::Merge;
virtual Status Merge(const WriteOptions& options, virtual Status Merge(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override { const Slice& value) override {
return db_->Merge(options, key, value); return db_->Merge(options, column_family, key, value);
} }
@ -63,10 +72,20 @@ class StackableDB : public DB {
return db_->Write(opts, updates); return db_->Write(opts, updates);
} }
virtual Iterator* NewIterator(const ReadOptions& opts) override { using DB::NewIterator;
return db_->NewIterator(opts); virtual Iterator* NewIterator(const ReadOptions& opts,
ColumnFamilyHandle* column_family) override {
return db_->NewIterator(opts, column_family);
}
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) {
return db_->NewIterators(options, column_families, iterators);
} }
virtual const Snapshot* GetSnapshot() override { virtual const Snapshot* GetSnapshot() override {
return db_->GetSnapshot(); return db_->GetSnapshot();
} }
@ -75,32 +94,43 @@ class StackableDB : public DB {
return db_->ReleaseSnapshot(snapshot); return db_->ReleaseSnapshot(snapshot);
} }
virtual bool GetProperty(const Slice& property, std::string* value) using DB::GetProperty;
override { virtual bool GetProperty(ColumnFamilyHandle* column_family,
return db_->GetProperty(property, value); const Slice& property, std::string* value) override {
return db_->GetProperty(column_family, property, value);
} }
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) using DB::GetApproximateSizes;
override { virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
return db_->GetApproximateSizes(r, n, sizes); const Range* r, int n,
uint64_t* sizes) override {
return db_->GetApproximateSizes(column_family, r, n, sizes);
} }
virtual Status CompactRange(const Slice* begin, const Slice* end, using DB::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, bool reduce_level = false,
int target_level = -1) override { int target_level = -1) override {
return db_->CompactRange(begin, end, reduce_level, target_level); return db_->CompactRange(column_family, begin, end, reduce_level,
target_level);
} }
virtual int NumberLevels() override { using DB::NumberLevels;
return db_->NumberLevels(); virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
return db_->NumberLevels(column_family);
} }
virtual int MaxMemCompactionLevel() override { using DB::MaxMemCompactionLevel;
return db_->MaxMemCompactionLevel(); virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
override {
return db_->MaxMemCompactionLevel(column_family);
} }
virtual int Level0StopWriteTrigger() override { using DB::Level0StopWriteTrigger;
return db_->Level0StopWriteTrigger(); virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
override {
return db_->Level0StopWriteTrigger(column_family);
} }
virtual const std::string& GetName() const override { virtual const std::string& GetName() const override {
@ -111,12 +141,16 @@ class StackableDB : public DB {
return db_->GetEnv(); return db_->GetEnv();
} }
virtual const Options& GetOptions() const override { using DB::GetOptions;
return db_->GetOptions(); virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
override {
return db_->GetOptions(column_family);
} }
virtual Status Flush(const FlushOptions& fopts) override { using DB::Flush;
return db_->Flush(fopts); virtual Status Flush(const FlushOptions& fopts,
ColumnFamilyHandle* column_family) override {
return db_->Flush(fopts, column_family);
} }
virtual Status DisableFileDeletions() override { virtual Status DisableFileDeletions() override {
@ -148,8 +182,10 @@ class StackableDB : public DB {
return db_->GetDbIdentity(identity); return db_->GetDbIdentity(identity);
} }
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { using DB::GetPropertiesOfAllTables;
return db_->GetPropertiesOfAllTables(props); virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props) {
return db_->GetPropertiesOfAllTables(column_family, props);
} }
virtual Status GetUpdatesSince( virtual Status GetUpdatesSince(
@ -158,6 +194,10 @@ class StackableDB : public DB {
return db_->GetUpdatesSince(seq_number, iter, read_options); return db_->GetUpdatesSince(seq_number, iter, read_options);
} }
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
return db_->DefaultColumnFamily();
}
protected: protected:
DB* db_; DB* db_;
}; };

@ -208,7 +208,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options);
mem->Ref(); mem->Ref();
std::string state; std::string state;
rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, mem, &options); rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
rocksdb::Status s =
rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
int count = 0; int count = 0;
rocksdb::Iterator* iter = mem->NewIterator(); rocksdb::Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {

@ -127,13 +127,6 @@ extern bool Snappy_GetUncompressedLength(const char* input, size_t length,
extern bool Snappy_Uncompress(const char* input_data, size_t input_length, extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
char* output); char* output);
// ------------------ Miscellaneous -------------------
// If heap profiling is not supported, returns false.
// Else repeatedly calls (*func)(arg, data, n) and then returns true.
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
} // namespace port } // namespace port
} // namespace rocksdb } // namespace rocksdb

@ -476,10 +476,6 @@ inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input,
return false; return false;
} }
inline bool GetHeapProfile(void (*func)(void *, const char *, int), void *arg) {
return false;
}
#define CACHE_LINE_SIZE 64U #define CACHE_LINE_SIZE 64U
} // namespace port } // namespace port

@ -45,7 +45,9 @@ namespace {
// The longest the prefix of the cache key used to identify blocks can be. // The longest the prefix of the cache key used to identify blocks can be.
// We are using the fact that we know for Posix files the unique ID is three // We are using the fact that we know for Posix files the unique ID is three
// varints. // varints.
const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; // For some reason, compiling for iOS complains that this variable is unused
const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
kMaxVarint64Length * 3 + 1;
// Read the block identified by "handle" from "file". // Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now. // The only relevant option is options.verify_checksums for now.
@ -105,7 +107,7 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
Statistics* statistics) { Statistics* statistics) {
auto cache_handle = block_cache->Lookup(key); auto cache_handle = block_cache->Lookup(key);
if (cache_handle != nullptr) { if (cache_handle != nullptr) {
BumpPerfCount(&perf_context.block_cache_hit_count); PERF_COUNTER_ADD(block_cache_hit_count, 1);
// overall cache hit // overall cache hit
RecordTick(statistics, BLOCK_CACHE_HIT); RecordTick(statistics, BLOCK_CACHE_HIT);
// block-type specific cache hit // block-type specific cache hit

@ -46,6 +46,9 @@ class FilterBlockBuilder {
bool SamePrefix(const Slice &key1, const Slice &key2) const; bool SamePrefix(const Slice &key1, const Slice &key2) const;
void GenerateFilter(); void GenerateFilter();
// important: all of these might point to invalid addresses
// at the time of destruction of this filter block. destructor
// should NOT dereference them.
const FilterPolicy* policy_; const FilterPolicy* policy_;
const SliceTransform* prefix_extractor_; const SliceTransform* prefix_extractor_;
bool whole_key_filtering_; bool whole_key_filtering_;

@ -125,12 +125,11 @@ Status ReadBlockContents(RandomAccessFile* file,
char* buf = new char[n + kBlockTrailerSize]; char* buf = new char[n + kBlockTrailerSize];
Slice contents; Slice contents;
StopWatchNano timer(env); PERF_TIMER_AUTO(block_read_time);
StartPerfTimer(&timer);
Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
BumpPerfCount(&perf_context.block_read_count); PERF_TIMER_MEASURE(block_read_time);
BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize); PERF_COUNTER_ADD(block_read_count, 1);
BumpPerfTime(&perf_context.block_read_time, &timer); PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
if (!s.ok()) { if (!s.ok()) {
delete[] buf; delete[] buf;
@ -151,7 +150,7 @@ Status ReadBlockContents(RandomAccessFile* file,
s = Status::Corruption("block checksum mismatch"); s = Status::Corruption("block checksum mismatch");
return s; return s;
} }
BumpPerfTime(&perf_context.block_checksum_time, &timer); PERF_TIMER_MEASURE(block_checksum_time);
} }
// If the caller has requested that the block not be uncompressed // If the caller has requested that the block not be uncompressed
@ -175,7 +174,7 @@ Status ReadBlockContents(RandomAccessFile* file,
s = UncompressBlockContents(data, n, result); s = UncompressBlockContents(data, n, result);
delete[] buf; delete[] buf;
} }
BumpPerfTime(&perf_context.block_decompress_time, &timer); PERF_TIMER_STOP(block_decompress_time);
return s; return s;
} }

@ -25,16 +25,14 @@ namespace {
class MergingIterator : public Iterator { class MergingIterator : public Iterator {
public: public:
MergingIterator(Env* const env, const Comparator* comparator, MergingIterator(const Comparator* comparator, Iterator** children, int n)
Iterator** children, int n)
: comparator_(comparator), : comparator_(comparator),
children_(n), children_(n),
current_(nullptr), current_(nullptr),
use_heap_(true), use_heap_(true),
env_(env),
direction_(kForward), direction_(kForward),
maxHeap_(NewMaxIterHeap(comparator_)), maxHeap_(NewMaxIterHeap(comparator_)),
minHeap_ (NewMinIterHeap(comparator_)) { minHeap_(NewMinIterHeap(comparator_)) {
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
children_[i].Set(children[i]); children_[i].Set(children[i]);
} }
@ -79,13 +77,13 @@ class MergingIterator : public Iterator {
// Invalidate the heap. // Invalidate the heap.
use_heap_ = false; use_heap_ = false;
IteratorWrapper* first_child = nullptr; IteratorWrapper* first_child = nullptr;
StopWatchNano child_seek_timer(env_, false); PERF_TIMER_DECLARE();
StopWatchNano min_heap_timer(env_, false);
for (auto& child : children_) { for (auto& child : children_) {
StartPerfTimer(&child_seek_timer); PERF_TIMER_START(seek_child_seek_time);
child.Seek(target); child.Seek(target);
BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); PERF_TIMER_STOP(seek_child_seek_time);
BumpPerfCount(&perf_context.seek_child_seek_count); PERF_COUNTER_ADD(seek_child_seek_count, 1);
if (child.Valid()) { if (child.Valid()) {
// This child has valid key // This child has valid key
@ -97,26 +95,24 @@ class MergingIterator : public Iterator {
} else { } else {
// We have more than one children with valid keys. Initialize // We have more than one children with valid keys. Initialize
// the heap and put the first child into the heap. // the heap and put the first child into the heap.
StartPerfTimer(&min_heap_timer); PERF_TIMER_START(seek_min_heap_time);
ClearHeaps(); ClearHeaps();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
StartPerfTimer(&min_heap_timer);
minHeap_.push(first_child); minHeap_.push(first_child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); PERF_TIMER_STOP(seek_min_heap_time);
} }
} }
if (use_heap_) { if (use_heap_) {
StartPerfTimer(&min_heap_timer); PERF_TIMER_START(seek_min_heap_time);
minHeap_.push(&child); minHeap_.push(&child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); PERF_TIMER_STOP(seek_min_heap_time);
} }
} }
} }
if (use_heap_) { if (use_heap_) {
// If heap is valid, need to put the smallest key to curent_. // If heap is valid, need to put the smallest key to curent_.
StartPerfTimer(&min_heap_timer); PERF_TIMER_START(seek_min_heap_time);
FindSmallest(); FindSmallest();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); PERF_TIMER_STOP(seek_min_heap_time);
} else { } else {
// The heap is not valid, then the current_ iterator is the first // The heap is not valid, then the current_ iterator is the first
// one, or null if there is no first child. // one, or null if there is no first child.
@ -232,7 +228,6 @@ class MergingIterator : public Iterator {
// This flag is always true for reverse direction, as we always use heap for // This flag is always true for reverse direction, as we always use heap for
// the reverse iterating case. // the reverse iterating case.
bool use_heap_; bool use_heap_;
Env* const env_;
// Which direction is the iterator moving? // Which direction is the iterator moving?
enum Direction { enum Direction {
kForward, kForward,
@ -272,15 +267,14 @@ void MergingIterator::ClearHeaps() {
} }
} // namespace } // namespace
Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
Iterator** list, int n) {
assert(n >= 0); assert(n >= 0);
if (n == 0) { if (n == 0) {
return NewEmptyIterator(); return NewEmptyIterator();
} else if (n == 1) { } else if (n == 1) {
return list[0]; return list[0];
} else { } else {
return new MergingIterator(env, cmp, list, n); return new MergingIterator(cmp, list, n);
} }
} }

@ -23,8 +23,7 @@ class Env;
// key is present in K child iterators, it will be yielded K times. // key is present in K child iterators, it will be yielded K times.
// //
// REQUIRES: n >= 0 // REQUIRES: n >= 0
extern Iterator* NewMergingIterator(Env* const env, extern Iterator* NewMergingIterator(const Comparator* comparator,
const Comparator* comparator,
Iterator** children, int n); Iterator** children, int n);
} // namespace rocksdb } // namespace rocksdb

@ -81,10 +81,9 @@ class PlainTableIterator : public Iterator {
bool use_prefix_seek_; bool use_prefix_seek_;
uint32_t offset_; uint32_t offset_;
uint32_t next_offset_; uint32_t next_offset_;
Slice key_; IterKey key_;
Slice value_; Slice value_;
Status status_; Status status_;
std::string tmp_str_;
// No copying allowed // No copying allowed
PlainTableIterator(const PlainTableIterator&) = delete; PlainTableIterator(const PlainTableIterator&) = delete;
void operator=(const Iterator&) = delete; void operator=(const Iterator&) = delete;
@ -720,9 +719,7 @@ void PlainTableIterator::Next() {
status_ = table_->Next(&next_offset_, &parsed_key, &value_); status_ = table_->Next(&next_offset_, &parsed_key, &value_);
if (status_.ok()) { if (status_.ok()) {
// Make a copy in this case. TODO optimize. // Make a copy in this case. TODO optimize.
tmp_str_.clear(); key_.SetInternalKey(parsed_key);
AppendInternalKey(&tmp_str_, parsed_key);
key_ = Slice(tmp_str_);
} else { } else {
offset_ = next_offset_ = table_->data_end_offset_; offset_ = next_offset_ = table_->data_end_offset_;
} }
@ -735,7 +732,7 @@ void PlainTableIterator::Prev() {
Slice PlainTableIterator::key() const { Slice PlainTableIterator::key() const {
assert(Valid()); assert(Valid());
return key_; return key_.GetKey();
} }
Slice PlainTableIterator::value() const { Slice PlainTableIterator::value() const {

@ -1554,7 +1554,8 @@ TEST(MemTableTest, Simple) {
batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k2"), std::string("v2"));
batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("k3"), std::string("v3"));
batch.Put(std::string("largekey"), std::string("vlarge")); batch.Put(std::string("largekey"), std::string("vlarge"));
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok()); ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options);
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok());
Iterator* iter = memtable->NewIterator(); Iterator* iter = memtable->NewIterator();
iter->SeekToFirst(); iter->SeekToFirst();

@ -0,0 +1,71 @@
TMP_DIR="/tmp/rocksdb-sanity-test"
if [ "$#" -lt 2 ]; then
echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
commit_new=`echo "$recent_commits" | head -n1`
commit_old=`echo "$recent_commits" | tail -n1`
echo "the most recent commits are:"
echo "$recent_commits"
else
commit_new=$1
commit_old=$2
fi
if [ ! -d $TMP_DIR ]; then
mkdir $TMP_DIR
fi
dir_new="${TMP_DIR}/${commit_new}"
dir_old="${TMP_DIR}/${commit_old}"
function makestuff() {
echo "make clean"
make clean > /dev/null
echo "make db_sanity_test -j32"
make db_sanity_test -j32 > /dev/null
if [ $? -ne 0 ]; then
echo "[ERROR] Failed to perform 'make db_sanity_test'"
exit 1
fi
}
rm -r -f $dir_new
rm -r -f $dir_old
echo "Running db sanity check with commits $commit_new and $commit_old."
echo "============================================================="
echo "Making build $commit_new"
makestuff
mv db_sanity_test new_db_sanity_test
echo "Creating db based on the new commit --- $commit_new"
./new_db_sanity_test $dir_new create
echo "============================================================="
echo "Making build $commit_old"
makestuff
mv db_sanity_test old_db_sanity_test
echo "Creating db based on the old commit --- $commit_old"
./old_db_sanity_test $dir_old create
echo "============================================================="
echo "Verifying new db $dir_new using the old commit --- $commit_old"
./old_db_sanity_test $dir_new verify
if [ $? -ne 0 ]; then
echo "[ERROR] Verification of $dir_new using commit $commit_old failed."
exit 2
fi
echo "============================================================="
echo "Verifying old db $dir_old using the new commit --- $commit_new"
./new_db_sanity_test $dir_old verify
if [ $? -ne 0 ]; then
echo "[ERROR] Verification of $dir_old using commit $commit_new failed."
exit 2
fi
rm old_db_sanity_test
rm new_db_sanity_test
echo "Auto sanity test passed!"

@ -88,6 +88,7 @@ def main(argv):
--open_files=500000 --open_files=500000
--verify_checksum=1 --verify_checksum=1
--sync=0 --sync=0
--progress_reports=0
--disable_wal=0 --disable_wal=0
--disable_data_sync=1 --disable_data_sync=1
--target_file_size_base=2097152 --target_file_size_base=2097152

@ -101,6 +101,7 @@ def main(argv):
--open_files=500000 --open_files=500000
--verify_checksum=1 --verify_checksum=1
--sync=0 --sync=0
--progress_reports=0
--disable_wal=0 --disable_wal=0
--disable_data_sync=1 --disable_data_sync=1
--target_file_size_base=2097152 --target_file_size_base=2097152

@ -60,14 +60,16 @@ static bool ValidateUint32Range(const char* flagname, uint64_t value) {
return true; return true;
} }
DEFINE_uint64(seed, 2341234, "Seed for PRNG"); DEFINE_uint64(seed, 2341234, "Seed for PRNG");
static const bool FLAGS_seed_dummy = static const bool FLAGS_seed_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
DEFINE_int64(max_key, 1 * KB * KB * KB, DEFINE_int64(max_key, 1 * KB* KB,
"Max number of key/values to place in database"); "Max number of key/values to place in database");
DEFINE_int32(column_families, 10, "Number of column families");
DEFINE_bool(test_batches_snapshots, false, DEFINE_bool(test_batches_snapshots, false,
"If set, the test uses MultiGet(), MultiPut() and MultiDelete()" "If set, the test uses MultiGet(), Multiut() and MultiDelete()"
" which read/write/delete multiple keys in a batch. In this mode," " which read/write/delete multiple keys in a batch. In this mode,"
" we do not verify db content by comparing the content with the " " we do not verify db content by comparing the content with the "
"pre-allocated array. Instead, we do partial verification inside" "pre-allocated array. Instead, we do partial verification inside"
@ -95,7 +97,10 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings");
DEFINE_bool(destroy_db_initially, true, DEFINE_bool(destroy_db_initially, true,
"Destroys the database dir before start if this is true"); "Destroys the database dir before start if this is true");
DEFINE_bool (verbose, false, "Verbose"); DEFINE_bool(verbose, false, "Verbose");
DEFINE_bool(progress_reports, true,
"If true, db_stress will report number of finished operations");
DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
"Number of bytes to buffer in memtable before compacting"); "Number of bytes to buffer in memtable before compacting");
@ -146,6 +151,10 @@ DEFINE_int32(max_background_compactions,
"The maximum number of concurrent background compactions " "The maximum number of concurrent background compactions "
"that can occur in parallel."); "that can occur in parallel.");
DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
"The maximum number of concurrent background flushes "
"that can occur in parallel.");
DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger" DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
" compaction in universal style"); " compaction in universal style");
@ -158,6 +167,11 @@ DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
DEFINE_int32(universal_max_size_amplification_percent, 0, DEFINE_int32(universal_max_size_amplification_percent, 0,
"The max size amplification for universal style compaction"); "The max size amplification for universal style compaction");
DEFINE_int32(clear_column_family_one_in, 1000000,
"With a chance of 1/N, delete a column family and then recreate "
"it again. If N == 0, never drop/create column families. "
"When test_batches_snapshots is true, this flag has no effect");
DEFINE_int64(cache_size, 2 * KB * KB * KB, DEFINE_int64(cache_size, 2 * KB * KB * KB,
"Number of bytes to use as a cache of uncompressed data."); "Number of bytes to use as a cache of uncompressed data.");
@ -170,8 +184,8 @@ static bool ValidateInt32Positive(const char* flagname, int32_t value) {
return true; return true;
} }
DEFINE_int32(reopen, 10, "Number of times database reopens"); DEFINE_int32(reopen, 10, "Number of times database reopens");
static const bool FLAGS_reopen_dummy = static const bool FLAGS_reopen_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive); google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. " DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
"Negative means use default settings."); "Negative means use default settings.");
@ -198,9 +212,9 @@ DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
DEFINE_int32(kill_random_test, 0, DEFINE_int32(kill_random_test, 0,
"If non-zero, kill at various points in source code with " "If non-zero, kill at various points in source code with "
"probability 1/this"); "probability 1/this");
static const bool FLAGS_kill_random_test_dummy = static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_kill_random_test, google::RegisterFlagValidator(&FLAGS_kill_random_test,
&ValidateInt32Positive); &ValidateInt32Positive);
extern int rocksdb_kill_odds; extern int rocksdb_kill_odds;
DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
@ -226,42 +240,37 @@ static bool ValidateInt32Percent(const char* flagname, int32_t value) {
} }
DEFINE_int32(readpercent, 10, DEFINE_int32(readpercent, 10,
"Ratio of reads to total workload (expressed as a percentage)"); "Ratio of reads to total workload (expressed as a percentage)");
static const bool FLAGS_readpercent_dummy = static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
DEFINE_int32(prefixpercent, 20, DEFINE_int32(prefixpercent, 20,
"Ratio of prefix iterators to total workload (expressed as a" "Ratio of prefix iterators to total workload (expressed as a"
" percentage)"); " percentage)");
static const bool FLAGS_prefixpercent_dummy = static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
DEFINE_int32(writepercent, 45, DEFINE_int32(writepercent, 45,
" Ratio of deletes to total workload (expressed as a percentage)"); " Ratio of deletes to total workload (expressed as a percentage)");
static const bool FLAGS_writepercent_dummy = static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
DEFINE_int32(delpercent, 15, DEFINE_int32(delpercent, 15,
"Ratio of deletes to total workload (expressed as a percentage)"); "Ratio of deletes to total workload (expressed as a percentage)");
static const bool FLAGS_delpercent_dummy = static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload" DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
" (expressed as a percentage)"); " (expressed as a percentage)");
static const bool FLAGS_iterpercent_dummy = static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
static const bool FLAGS_num_iterations_dummy = static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
DEFINE_bool(disable_seek_compaction, false, DEFINE_bool(disable_seek_compaction, false,
"Option to disable compation triggered by read."); "Option to disable compation triggered by read.");
DEFINE_uint64(delete_obsolete_files_period_micros, 0,
"Option to delete obsolete files periodically"
"0 means that obsolete files are "
" deleted after every compaction run.");
enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
assert(ctype); assert(ctype);
@ -290,21 +299,21 @@ DEFINE_string(hdfs, "", "Name of hdfs environment");
// posix or hdfs environment // posix or hdfs environment
static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread."); DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
static const bool FLAGS_ops_per_thread_dummy = static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range); google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock"); DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
static const bool FLAGS_log2_keys_per_lock_dummy = static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock, google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
&ValidateUint32Range); &ValidateUint32Range);
DEFINE_int32(purge_redundant_percent, 50, DEFINE_int32(purge_redundant_percent, 50,
"Percentage of times we want to purge redundant keys in memory " "Percentage of times we want to purge redundant keys in memory "
"before flushing"); "before flushing");
static const bool FLAGS_purge_redundant_percent_dummy = static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_purge_redundant_percent, google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
&ValidateInt32Percent); &ValidateInt32Percent);
DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop" DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
" the delete if key not present"); " the delete if key not present");
@ -438,16 +447,18 @@ class Stats {
last_op_finish_ = now; last_op_finish_ = now;
} }
done_++; done_++;
if (done_ >= next_report_) { if (FLAGS_progress_reports) {
if (next_report_ < 1000) next_report_ += 100; if (done_ >= next_report_) {
else if (next_report_ < 5000) next_report_ += 500; if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 10000) next_report_ += 1000; else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 50000) next_report_ += 5000; else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 100000) next_report_ += 10000; else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 500000) next_report_ += 50000; else if (next_report_ < 100000) next_report_ += 10000;
else next_report_ += 100000; else if (next_report_ < 500000) next_report_ += 50000;
fprintf(stdout, "... finished %ld ops%30s\r", done_, ""); else next_report_ += 100000;
fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
}
} }
} }
@ -515,7 +526,7 @@ class Stats {
// State shared by all concurrent executions of the same benchmark. // State shared by all concurrent executions of the same benchmark.
class SharedState { class SharedState {
public: public:
static const uint32_t SENTINEL = 0xffffffff; static const uint32_t SENTINEL;
explicit SharedState(StressTest* stress_test) : explicit SharedState(StressTest* stress_test) :
cv_(&mu_), cv_(&mu_),
@ -531,28 +542,27 @@ class SharedState {
start_verify_(false), start_verify_(false),
stress_test_(stress_test) { stress_test_(stress_test) {
if (FLAGS_test_batches_snapshots) { if (FLAGS_test_batches_snapshots) {
key_locks_ = nullptr;
values_ = nullptr;
fprintf(stdout, "No lock creation because test_batches_snapshots set\n"); fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
return; return;
} }
values_ = new uint32_t[max_key_]; values_.resize(FLAGS_column_families);
for (long i = 0; i < max_key_; i++) {
values_[i] = SENTINEL; for (int i = 0; i < FLAGS_column_families; ++i) {
values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
} }
long num_locks = (max_key_ >> log2_keys_per_lock_); long num_locks = (max_key_ >> log2_keys_per_lock_);
if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) { if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
num_locks ++; num_locks++;
}
fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
key_locks_.resize(FLAGS_column_families);
for (int i = 0; i < FLAGS_column_families; ++i) {
key_locks_[i] = std::vector<port::Mutex>(num_locks);
} }
fprintf(stdout, "Creating %ld locks\n", num_locks);
key_locks_ = new port::Mutex[num_locks];
} }
~SharedState() { ~SharedState() {}
delete[] values_;
delete[] key_locks_;
}
port::Mutex* GetMutex() { port::Mutex* GetMutex() {
return &mu_; return &mu_;
@ -622,26 +632,36 @@ class SharedState {
return start_verify_; return start_verify_;
} }
port::Mutex* GetMutexForKey(long key) { port::Mutex* GetMutexForKey(int cf, long key) {
return &key_locks_[key >> log2_keys_per_lock_]; return &key_locks_[cf][key >> log2_keys_per_lock_];
} }
void Put(long key, uint32_t value_base) { void LockColumnFamily(int cf) {
values_[key] = value_base; for (auto& mutex : key_locks_[cf]) {
mutex.Lock();
}
} }
uint32_t Get(long key) const { void UnlockColumnFamily(int cf) {
return values_[key]; for (auto& mutex : key_locks_[cf]) {
mutex.Unlock();
}
} }
void Delete(long key) const { void ClearColumnFamily(int cf) {
values_[key] = SENTINEL; std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
} }
uint32_t GetSeed() const { void Put(int cf, long key, uint32_t value_base) {
return seed_; values_[cf][key] = value_base;
} }
uint32_t Get(int cf, long key) const { return values_[cf][key]; }
void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }
uint32_t GetSeed() const { return seed_; }
private: private:
port::Mutex mu_; port::Mutex mu_;
port::CondVar cv_; port::CondVar cv_;
@ -657,11 +677,12 @@ class SharedState {
bool start_verify_; bool start_verify_;
StressTest* stress_test_; StressTest* stress_test_;
uint32_t *values_; std::vector<std::vector<uint32_t>> values_;
port::Mutex *key_locks_; std::vector<std::vector<port::Mutex>> key_locks_;
}; };
const uint32_t SharedState::SENTINEL = 0xffffffff;
// Per-thread state for concurrent executions of the same benchmark. // Per-thread state for concurrent executions of the same benchmark.
struct ThreadState { struct ThreadState {
uint32_t tid; // 0..n-1 uint32_t tid; // 0..n-1
@ -682,13 +703,14 @@ class StressTest {
public: public:
StressTest() StressTest()
: cache_(NewLRUCache(FLAGS_cache_size)), : cache_(NewLRUCache(FLAGS_cache_size)),
compressed_cache_(FLAGS_compressed_cache_size >= 0 ? compressed_cache_(FLAGS_compressed_cache_size >= 0
NewLRUCache(FLAGS_compressed_cache_size) : ? NewLRUCache(FLAGS_compressed_cache_size)
nullptr), : nullptr),
filter_policy_(FLAGS_bloom_bits >= 0 filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits) ? NewBloomFilterPolicy(FLAGS_bloom_bits)
: nullptr), : nullptr),
db_(nullptr), db_(nullptr),
new_column_family_name_(0),
num_times_reopened_(0) { num_times_reopened_(0) {
if (FLAGS_destroy_db_initially) { if (FLAGS_destroy_db_initially) {
std::vector<std::string> files; std::vector<std::string> files;
@ -703,6 +725,10 @@ class StressTest {
} }
~StressTest() { ~StressTest() {
for (auto cf : column_families_) {
delete cf;
}
column_families_.clear();
delete db_; delete db_;
delete filter_policy_; delete filter_policy_;
} }
@ -817,9 +843,9 @@ class StressTest {
// Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ... // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
// ("9"+K, "9"+V) in DB atomically i.e in a single batch. // ("9"+K, "9"+V) in DB atomically i.e in a single batch.
// Also refer MultiGet. // Also refer MultiGet.
Status MultiPut(ThreadState* thread, Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
const WriteOptions& writeoptions, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& key, const Slice& value, size_t sz) { const Slice& value, size_t sz) {
std::string keys[10] = {"9", "8", "7", "6", "5", std::string keys[10] = {"9", "8", "7", "6", "5",
"4", "3", "2", "1", "0"}; "4", "3", "2", "1", "0"};
std::string values[10] = {"9", "8", "7", "6", "5", std::string values[10] = {"9", "8", "7", "6", "5",
@ -832,9 +858,9 @@ class StressTest {
values[i] += value.ToString(); values[i] += value.ToString();
value_slices[i] = values[i]; value_slices[i] = values[i];
if (FLAGS_use_merge) { if (FLAGS_use_merge) {
batch.Merge(keys[i], value_slices[i]); batch.Merge(column_family, keys[i], value_slices[i]);
} else { } else {
batch.Put(keys[i], value_slices[i]); batch.Put(column_family, keys[i], value_slices[i]);
} }
} }
@ -852,9 +878,8 @@ class StressTest {
// Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K) // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
// in DB atomically i.e in a single batch. Also refer MultiGet. // in DB atomically i.e in a single batch. Also refer MultiGet.
Status MultiDelete(ThreadState* thread, Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
const WriteOptions& writeoptions, ColumnFamilyHandle* column_family, const Slice& key) {
const Slice& key) {
std::string keys[10] = {"9", "7", "5", "3", "1", std::string keys[10] = {"9", "7", "5", "3", "1",
"8", "6", "4", "2", "0"}; "8", "6", "4", "2", "0"};
@ -862,7 +887,7 @@ class StressTest {
Status s; Status s;
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
keys[i] += key.ToString(); keys[i] += key.ToString();
batch.Delete(keys[i]); batch.Delete(column_family, keys[i]);
} }
s = db_->Write(writeoptions, &batch); s = db_->Write(writeoptions, &batch);
@ -880,9 +905,9 @@ class StressTest {
// in the same snapshot, and verifies that all the values are of the form // in the same snapshot, and verifies that all the values are of the form
// "0"+V, "1"+V,..."9"+V. // "0"+V, "1"+V,..."9"+V.
// ASSUMES that MultiPut was used to put (K, V) into the DB. // ASSUMES that MultiPut was used to put (K, V) into the DB.
Status MultiGet(ThreadState* thread, Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
const ReadOptions& readoptions, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& key, std::string* value) { std::string* value) {
std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
Slice key_slices[10]; Slice key_slices[10];
std::string values[10]; std::string values[10];
@ -892,7 +917,7 @@ class StressTest {
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
keys[i] += key.ToString(); keys[i] += key.ToString();
key_slices[i] = keys[i]; key_slices[i] = keys[i];
s = db_->Get(readoptionscopy, key_slices[i], value); s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
if (!s.ok() && !s.IsNotFound()) { if (!s.ok() && !s.IsNotFound()) {
fprintf(stderr, "get error: %s\n", s.ToString().c_str()); fprintf(stderr, "get error: %s\n", s.ToString().c_str());
values[i] = ""; values[i] = "";
@ -937,8 +962,8 @@ class StressTest {
// each series should be the same length, and it is verified for each // each series should be the same length, and it is verified for each
// index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V. // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
// ASSUMES that MultiPut was used to put (K, V) // ASSUMES that MultiPut was used to put (K, V)
Status MultiPrefixScan(ThreadState* thread, Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
const ReadOptions& readoptions, ColumnFamilyHandle* column_family,
const Slice& key) { const Slice& key) {
std::string prefixes[10] = {"0", "1", "2", "3", "4", std::string prefixes[10] = {"0", "1", "2", "3", "4",
"5", "6", "7", "8", "9"}; "5", "6", "7", "8", "9"};
@ -954,7 +979,7 @@ class StressTest {
readoptionscopy[i] = readoptions; readoptionscopy[i] = readoptions;
readoptionscopy[i].prefix_seek = true; readoptionscopy[i].prefix_seek = true;
readoptionscopy[i].snapshot = snapshot; readoptionscopy[i].snapshot = snapshot;
iters[i] = db_->NewIterator(readoptionscopy[i]); iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
iters[i]->Seek(prefix_slices[i]); iters[i]->Seek(prefix_slices[i]);
} }
@ -1012,15 +1037,14 @@ class StressTest {
// Given a key K, this creates an iterator which scans to K and then // Given a key K, this creates an iterator which scans to K and then
// does a random sequence of Next/Prev operations. // does a random sequence of Next/Prev operations.
Status MultiIterate(ThreadState* thread, Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions,
const ReadOptions& readoptions, ColumnFamilyHandle* column_family, const Slice& key) {
const Slice& key) {
Status s; Status s;
const Snapshot* snapshot = db_->GetSnapshot(); const Snapshot* snapshot = db_->GetSnapshot();
ReadOptions readoptionscopy = readoptions; ReadOptions readoptionscopy = readoptions;
readoptionscopy.snapshot = snapshot; readoptionscopy.snapshot = snapshot;
readoptionscopy.prefix_seek = FLAGS_prefix_size > 0; readoptionscopy.prefix_seek = FLAGS_prefix_size > 0;
unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy)); unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
iter->Seek(key); iter->Seek(key);
for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) { for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
@ -1075,15 +1099,50 @@ class StressTest {
} }
} }
if (!FLAGS_test_batches_snapshots &&
FLAGS_clear_column_family_one_in != 0) {
if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) {
// drop column family and then create it again (can't drop default)
int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
std::string new_name =
std::to_string(new_column_family_name_.fetch_add(1));
{
MutexLock l(thread->shared->GetMutex());
fprintf(
stdout,
"[CF %d] Dropping and recreating column family. new name: %s\n",
cf, new_name.c_str());
}
thread->shared->LockColumnFamily(cf);
Status s __attribute__((unused));
s = db_->DropColumnFamily(column_families_[cf]);
delete column_families_[cf];
assert(s.ok());
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
&column_families_[cf]);
column_family_names_[cf] = new_name;
thread->shared->ClearColumnFamily(cf);
assert(s.ok());
thread->shared->UnlockColumnFamily(cf);
}
}
long rand_key = thread->rand.Next() % max_key; long rand_key = thread->rand.Next() % max_key;
int rand_column_family = thread->rand.Next() % FLAGS_column_families;
std::string keystr = Key(rand_key); std::string keystr = Key(rand_key);
Slice key = keystr; Slice key = keystr;
int prob_op = thread->rand.Uniform(100); int prob_op = thread->rand.Uniform(100);
std::unique_ptr<MutexLock> l;
if (!FLAGS_test_batches_snapshots) {
l.reset(new MutexLock(
thread->shared->GetMutexForKey(rand_column_family, rand_key)));
}
auto column_family = column_families_[rand_column_family];
if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
// OPERATION read // OPERATION read
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
Status s = db_->Get(read_opts, key, &from_db); Status s = db_->Get(read_opts, column_family, key, &from_db);
if (s.ok()) { if (s.ok()) {
// found case // found case
thread->stats.AddGets(1, 1); thread->stats.AddGets(1, 1);
@ -1095,7 +1154,7 @@ class StressTest {
thread->stats.AddErrors(1); thread->stats.AddErrors(1);
} }
} else { } else {
MultiGet(thread, read_opts, key, &from_db); MultiGet(thread, read_opts, column_family, key, &from_db);
} }
} else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
// OPERATION prefix scan // OPERATION prefix scan
@ -1106,7 +1165,7 @@ class StressTest {
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
Slice prefix = Slice(key.data(), FLAGS_prefix_size); Slice prefix = Slice(key.data(), FLAGS_prefix_size);
read_opts.prefix_seek = true; read_opts.prefix_seek = true;
Iterator* iter = db_->NewIterator(read_opts); Iterator* iter = db_->NewIterator(read_opts, column_family);
int64_t count = 0; int64_t count = 0;
for (iter->Seek(prefix); for (iter->Seek(prefix);
iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
@ -1121,7 +1180,7 @@ class StressTest {
} }
delete iter; delete iter;
} else { } else {
MultiPrefixScan(thread, read_opts, key); MultiPrefixScan(thread, read_opts, column_family, key);
} }
} else if (prefixBound <= prob_op && prob_op < writeBound) { } else if (prefixBound <= prob_op && prob_op < writeBound) {
// OPERATION write // OPERATION write
@ -1129,42 +1188,36 @@ class StressTest {
size_t sz = GenerateValue(value_base, value, sizeof(value)); size_t sz = GenerateValue(value_base, value, sizeof(value));
Slice v(value, sz); Slice v(value, sz);
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
MutexLock l(thread->shared->GetMutexForKey(rand_key));
if (FLAGS_verify_before_write) { if (FLAGS_verify_before_write) {
std::string keystr2 = Key(rand_key); std::string keystr2 = Key(rand_key);
Slice k = keystr2; Slice k = keystr2;
Status s = db_->Get(read_opts, k, &from_db); Status s = db_->Get(read_opts, column_family, k, &from_db);
VerifyValue(rand_key, VerifyValue(rand_column_family, rand_key, read_opts,
read_opts, *(thread->shared), from_db, s, true);
*(thread->shared),
from_db,
s,
true);
} }
thread->shared->Put(rand_key, value_base); thread->shared->Put(rand_column_family, rand_key, value_base);
if (FLAGS_use_merge) { if (FLAGS_use_merge) {
db_->Merge(write_opts, key, v); db_->Merge(write_opts, column_family, key, v);
} else { } else {
db_->Put(write_opts, key, v); db_->Put(write_opts, column_family, key, v);
} }
thread->stats.AddBytesForWrites(1, sz); thread->stats.AddBytesForWrites(1, sz);
} else { } else {
MultiPut(thread, write_opts, key, v, sz); MultiPut(thread, write_opts, column_family, key, v, sz);
} }
PrintKeyValue(rand_key, value, sz); PrintKeyValue(rand_column_family, rand_key, value, sz);
} else if (writeBound <= prob_op && prob_op < delBound) { } else if (writeBound <= prob_op && prob_op < delBound) {
// OPERATION delete // OPERATION delete
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
MutexLock l(thread->shared->GetMutexForKey(rand_key)); thread->shared->Delete(rand_column_family, rand_key);
thread->shared->Delete(rand_key); db_->Delete(write_opts, column_family, key);
db_->Delete(write_opts, key);
thread->stats.AddDeletes(1); thread->stats.AddDeletes(1);
} else { } else {
MultiDelete(thread, write_opts, key); MultiDelete(thread, write_opts, column_family, key);
} }
} else { } else {
// OPERATION iterate // OPERATION iterate
MultiIterate(thread, read_opts, key); MultiIterate(thread, read_opts, column_family, key);
} }
thread->stats.FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
@ -1182,97 +1235,100 @@ class StressTest {
if (thread->tid == shared.GetNumThreads() - 1) { if (thread->tid == shared.GetNumThreads() - 1) {
end = max_key; end = max_key;
} }
for (size_t cf = 0; cf < column_families_.size(); ++cf) {
if (!thread->rand.OneIn(2)) { if (!thread->rand.OneIn(2)) {
options.prefix_seek = FLAGS_prefix_size > 0; // Use iterator to verify this range
// Use iterator to verify this range options.prefix_seek = FLAGS_prefix_size > 0;
unique_ptr<Iterator> iter(db_->NewIterator(options)); unique_ptr<Iterator> iter(
iter->Seek(Key(start)); db_->NewIterator(options, column_families_[cf]));
for (long i = start; i < end; i++) { iter->Seek(Key(start));
// TODO(ljin): update "long" to uint64_t for (long i = start; i < end; i++) {
// Reseek when the prefix changes // TODO(ljin): update "long" to uint64_t
if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) == 0) { // Reseek when the prefix changes
iter->Seek(Key(i)); if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) ==
} 0) {
std::string from_db; iter->Seek(Key(i));
std::string keystr = Key(i); }
Slice k = keystr; std::string from_db;
Status s = iter->status(); std::string keystr = Key(i);
if (iter->Valid()) { Slice k = keystr;
if (iter->key().compare(k) > 0) { Status s = iter->status();
if (iter->Valid()) {
if (iter->key().compare(k) > 0) {
s = Status::NotFound(Slice());
} else if (iter->key().compare(k) == 0) {
from_db = iter->value().ToString();
iter->Next();
} else if (iter->key().compare(k) < 0) {
VerificationAbort("An out of range key was found", cf, i);
}
} else {
// The iterator found no value for the key in question, so do not
// move to the next item in the iterator
s = Status::NotFound(Slice()); s = Status::NotFound(Slice());
} else if (iter->key().compare(k) == 0) {
from_db = iter->value().ToString();
iter->Next();
} else if (iter->key().compare(k) < 0) {
VerificationAbort("An out of range key was found", i);
} }
} else { VerifyValue(cf, i, options, shared, from_db, s, true);
// The iterator found no value for the key in question, so do not if (from_db.length()) {
// move to the next item in the iterator PrintKeyValue(cf, i, from_db.data(), from_db.length());
s = Status::NotFound(Slice()); }
}
VerifyValue(i, options, shared, from_db, s, true);
if (from_db.length()) {
PrintKeyValue(i, from_db.data(), from_db.length());
} }
} } else {
} else { // Use Get to verify this range
// Use Get to verify this range for (long i = start; i < end; i++) {
for (long i = start; i < end; i++) { std::string from_db;
std::string from_db; std::string keystr = Key(i);
std::string keystr = Key(i); Slice k = keystr;
Slice k = keystr; Status s = db_->Get(options, column_families_[cf], k, &from_db);
Status s = db_->Get(options, k, &from_db); VerifyValue(cf, i, options, shared, from_db, s, true);
VerifyValue(i, options, shared, from_db, s, true); if (from_db.length()) {
if (from_db.length()) { PrintKeyValue(cf, i, from_db.data(), from_db.length());
PrintKeyValue(i, from_db.data(), from_db.length()); }
} }
} }
} }
} }
void VerificationAbort(std::string msg, long key) const { void VerificationAbort(std::string msg, int cf, long key) const {
fprintf(stderr, "Verification failed for key %ld: %s\n", fprintf(stderr, "Verification failed for column family %d key %ld: %s\n",
key, msg.c_str()); cf, key, msg.c_str());
exit(1); exit(1);
} }
void VerifyValue(long key, void VerifyValue(int cf, long key, const ReadOptions& opts,
const ReadOptions &opts, const SharedState& shared, const std::string& value_from_db,
const SharedState &shared, Status s, bool strict = false) const {
const std::string &value_from_db,
Status s,
bool strict=false) const {
// compare value_from_db with the value in the shared state // compare value_from_db with the value in the shared state
char value[100]; char value[100];
uint32_t value_base = shared.Get(key); uint32_t value_base = shared.Get(cf, key);
if (value_base == SharedState::SENTINEL && !strict) { if (value_base == SharedState::SENTINEL && !strict) {
return; return;
} }
if (s.ok()) { if (s.ok()) {
if (value_base == SharedState::SENTINEL) { if (value_base == SharedState::SENTINEL) {
VerificationAbort("Unexpected value found", key); VerificationAbort("Unexpected value found", cf, key);
} }
size_t sz = GenerateValue(value_base, value, sizeof(value)); size_t sz = GenerateValue(value_base, value, sizeof(value));
if (value_from_db.length() != sz) { if (value_from_db.length() != sz) {
VerificationAbort("Length of value read is not equal", key); VerificationAbort("Length of value read is not equal", cf, key);
} }
if (memcmp(value_from_db.data(), value, sz) != 0) { if (memcmp(value_from_db.data(), value, sz) != 0) {
VerificationAbort("Contents of value read don't match", key); VerificationAbort("Contents of value read don't match", cf, key);
} }
} else { } else {
if (value_base != SharedState::SENTINEL) { if (value_base != SharedState::SENTINEL) {
VerificationAbort("Value not found", key); VerificationAbort("Value not found", cf, key);
} }
} }
} }
static void PrintKeyValue(uint32_t key, const char *value, size_t sz) { static void PrintKeyValue(int cf, uint32_t key, const char* value,
if (!FLAGS_verbose) return; size_t sz) {
fprintf(stdout, "%u ==> (%u) ", key, (unsigned int)sz); if (!FLAGS_verbose) {
for (size_t i=0; i<sz; i++) { return;
}
fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
for (size_t i = 0; i < sz; i++) {
fprintf(stdout, "%X", value[i]); fprintf(stdout, "%X", value[i]);
} }
fprintf(stdout, "\n"); fprintf(stdout, "\n");
@ -1290,8 +1346,13 @@ class StressTest {
} }
void PrintEnv() const { void PrintEnv() const {
fprintf(stdout, "LevelDB version : %d.%d\n", fprintf(stdout, "RocksDB version : %d.%d\n", kMajorVersion,
kMajorVersion, kMinorVersion); kMinorVersion);
fprintf(stdout, "Column families : %d\n", FLAGS_column_families);
if (!FLAGS_test_batches_snapshots) {
fprintf(stdout, "Clear CFs one in : %d\n",
FLAGS_clear_column_family_one_in);
}
fprintf(stdout, "Number of threads : %d\n", FLAGS_threads); fprintf(stdout, "Number of threads : %d\n", FLAGS_threads);
fprintf(stdout, fprintf(stdout,
"Ops per thread : %lu\n", "Ops per thread : %lu\n",
@ -1368,43 +1429,41 @@ class StressTest {
void Open() { void Open() {
assert(db_ == nullptr); assert(db_ == nullptr);
Options options; options_.block_cache = cache_;
options.block_cache = cache_; options_.block_cache_compressed = compressed_cache_;
options.block_cache_compressed = compressed_cache_; options_.write_buffer_size = FLAGS_write_buffer_size;
options.write_buffer_size = FLAGS_write_buffer_size; options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.max_write_buffer_number = FLAGS_max_write_buffer_number; options_.min_write_buffer_number_to_merge =
options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge;
FLAGS_min_write_buffer_number_to_merge; options_.max_background_compactions = FLAGS_max_background_compactions;
options.max_background_compactions = FLAGS_max_background_compactions; options_.max_background_flushes = FLAGS_max_background_flushes;
options.compaction_style = options_.compaction_style =
static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style); static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
options.block_size = FLAGS_block_size; options_.block_size = FLAGS_block_size;
options.filter_policy = filter_policy_; options_.filter_policy = filter_policy_;
options.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size)); options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
options.max_open_files = FLAGS_open_files; options_.max_open_files = FLAGS_open_files;
options.statistics = dbstats; options_.statistics = dbstats;
options.env = FLAGS_env; options_.env = FLAGS_env;
options.disableDataSync = FLAGS_disable_data_sync; options_.disableDataSync = FLAGS_disable_data_sync;
options.use_fsync = FLAGS_use_fsync; options_.use_fsync = FLAGS_use_fsync;
options.allow_mmap_reads = FLAGS_mmap_read; options_.allow_mmap_reads = FLAGS_mmap_read;
rocksdb_kill_odds = FLAGS_kill_random_test; rocksdb_kill_odds = FLAGS_kill_random_test;
options.target_file_size_base = FLAGS_target_file_size_base; options_.target_file_size_base = FLAGS_target_file_size_base;
options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
options.max_bytes_for_level_multiplier = options_.max_bytes_for_level_multiplier =
FLAGS_max_bytes_for_level_multiplier; FLAGS_max_bytes_for_level_multiplier;
options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
options.level0_slowdown_writes_trigger = options_.level0_slowdown_writes_trigger =
FLAGS_level0_slowdown_writes_trigger; FLAGS_level0_slowdown_writes_trigger;
options.level0_file_num_compaction_trigger = options_.level0_file_num_compaction_trigger =
FLAGS_level0_file_num_compaction_trigger; FLAGS_level0_file_num_compaction_trigger;
options.compression = FLAGS_compression_type_e; options_.compression = FLAGS_compression_type_e;
options.create_if_missing = true; options_.create_if_missing = true;
options.disable_seek_compaction = FLAGS_disable_seek_compaction; options_.disable_seek_compaction = FLAGS_disable_seek_compaction;
options.delete_obsolete_files_period_micros = options_.max_manifest_file_size = 10 * 1024;
FLAGS_delete_obsolete_files_period_micros; options_.filter_deletes = FLAGS_filter_deletes;
options.max_manifest_file_size = 1024;
options.filter_deletes = FLAGS_filter_deletes;
if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
fprintf(stderr, fprintf(stderr,
"prefix_size should be non-zero iff memtablerep == prefix_hash\n"); "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
@ -1412,51 +1471,107 @@ class StressTest {
} }
switch (FLAGS_rep_factory) { switch (FLAGS_rep_factory) {
case kHashSkipList: case kHashSkipList:
options.memtable_factory.reset(NewHashSkipListRepFactory()); options_.memtable_factory.reset(NewHashSkipListRepFactory());
break; break;
case kSkipList: case kSkipList:
// no need to do anything // no need to do anything
break; break;
case kVectorRep: case kVectorRep:
options.memtable_factory.reset(new VectorRepFactory()); options_.memtable_factory.reset(new VectorRepFactory());
break; break;
} }
static Random purge_percent(1000); // no benefit from non-determinism here static Random purge_percent(1000); // no benefit from non-determinism here
if (static_cast<int32_t>(purge_percent.Uniform(100)) < if (static_cast<int32_t>(purge_percent.Uniform(100)) <
FLAGS_purge_redundant_percent - 1) { FLAGS_purge_redundant_percent - 1) {
options.purge_redundant_kvs_while_flush = false; options_.purge_redundant_kvs_while_flush = false;
} }
if (FLAGS_use_merge) { if (FLAGS_use_merge) {
options.merge_operator = MergeOperators::CreatePutOperator(); options_.merge_operator = MergeOperators::CreatePutOperator();
} }
// set universal style compaction configurations, if applicable // set universal style compaction configurations, if applicable
if (FLAGS_universal_size_ratio != 0) { if (FLAGS_universal_size_ratio != 0) {
options.compaction_options_universal.size_ratio = options_.compaction_options_universal.size_ratio =
FLAGS_universal_size_ratio; FLAGS_universal_size_ratio;
} }
if (FLAGS_universal_min_merge_width != 0) { if (FLAGS_universal_min_merge_width != 0) {
options.compaction_options_universal.min_merge_width = options_.compaction_options_universal.min_merge_width =
FLAGS_universal_min_merge_width; FLAGS_universal_min_merge_width;
} }
if (FLAGS_universal_max_merge_width != 0) { if (FLAGS_universal_max_merge_width != 0) {
options.compaction_options_universal.max_merge_width = options_.compaction_options_universal.max_merge_width =
FLAGS_universal_max_merge_width; FLAGS_universal_max_merge_width;
} }
if (FLAGS_universal_max_size_amplification_percent != 0) { if (FLAGS_universal_max_size_amplification_percent != 0) {
options.compaction_options_universal.max_size_amplification_percent = options_.compaction_options_universal.max_size_amplification_percent =
FLAGS_universal_max_size_amplification_percent; FLAGS_universal_max_size_amplification_percent;
} }
fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
Status s; Status s;
if (FLAGS_ttl == -1) { if (FLAGS_ttl == -1) {
s = DB::Open(options, FLAGS_db, &db_); std::vector<std::string> existing_column_families;
s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
&existing_column_families); // ignore errors
if (!s.ok()) {
// DB doesn't exist
assert(existing_column_families.empty());
assert(column_family_names_.empty());
column_family_names_.push_back(default_column_family_name);
} else if (column_family_names_.empty()) {
// this is the first call to the function Open()
column_family_names_ = existing_column_families;
} else {
// this is a reopen. just assert that existing column_family_names are
// equivalent to what we remember
auto sorted_cfn = column_family_names_;
sort(sorted_cfn.begin(), sorted_cfn.end());
sort(existing_column_families.begin(), existing_column_families.end());
if (sorted_cfn != existing_column_families) {
fprintf(stderr,
"Expected column families differ from the existing:\n");
printf("Expected: {");
for (auto cf : sorted_cfn) {
printf("%s ", cf.c_str());
}
printf("}\n");
printf("Existing: {");
for (auto cf : existing_column_families) {
printf("%s ", cf.c_str());
}
printf("}\n");
}
assert(sorted_cfn == existing_column_families);
}
std::vector<ColumnFamilyDescriptor> cf_descriptors;
for (auto name : column_family_names_) {
if (name != default_column_family_name) {
new_column_family_name_ =
std::max(new_column_family_name_.load(), std::stoi(name) + 1);
}
cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
}
s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
&column_families_, &db_);
if (s.ok()) {
while (s.ok() &&
column_families_.size() < (size_t)FLAGS_column_families) {
ColumnFamilyHandle* cf = nullptr;
std::string name = std::to_string(new_column_family_name_.load());
new_column_family_name_++;
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), name, &cf);
column_families_.push_back(cf);
column_family_names_.push_back(name);
}
}
assert(!s.ok() || column_families_.size() ==
static_cast<size_t>(FLAGS_column_families));
} else { } else {
s = UtilityDB::OpenTtlDB(options, FLAGS_db, &sdb_, FLAGS_ttl); StackableDB* sdb;
db_ = sdb_; s = UtilityDB::OpenTtlDB(options_, FLAGS_db, &sdb, FLAGS_ttl);
db_ = sdb;
} }
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str()); fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@ -1465,13 +1580,11 @@ class StressTest {
} }
void Reopen() { void Reopen() {
// do not close the db. Just delete the lock file. This for (auto cf : column_families_) {
// simulates a crash-recovery kind of situation. delete cf;
if (FLAGS_ttl != -1) {
((DBWithTTL*) db_)->TEST_Destroy_DBWithTtl();
} else {
((DBImpl*) db_)->TEST_Destroy_DBImpl();
} }
column_families_.clear();
delete db_;
db_ = nullptr; db_ = nullptr;
num_times_reopened_++; num_times_reopened_++;
@ -1493,14 +1606,15 @@ class StressTest {
shared_ptr<Cache> compressed_cache_; shared_ptr<Cache> compressed_cache_;
const FilterPolicy* filter_policy_; const FilterPolicy* filter_policy_;
DB* db_; DB* db_;
StackableDB* sdb_; Options options_;
std::vector<ColumnFamilyHandle*> column_families_;
std::vector<std::string> column_family_names_;
std::atomic<int> new_column_family_name_;
int num_times_reopened_; int num_times_reopened_;
}; };
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {
google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
" [OPTIONS]..."); " [OPTIONS]...");

@ -81,7 +81,7 @@ Status CreateLoggerFromOptions(
const std::string& dbname, const std::string& dbname,
const std::string& db_log_dir, const std::string& db_log_dir,
Env* env, Env* env,
const Options& options, const DBOptions& options,
std::shared_ptr<Logger>* logger) { std::shared_ptr<Logger>* logger) {
std::string db_absolute_path; std::string db_absolute_path;
env->GetAbsolutePath(dbname, &db_absolute_path); env->GetAbsolutePath(dbname, &db_absolute_path);

@ -85,7 +85,7 @@ Status CreateLoggerFromOptions(
const std::string& dbname, const std::string& dbname,
const std::string& db_log_dir, const std::string& db_log_dir,
Env* env, Env* env,
const Options& options, const DBOptions& options,
std::shared_ptr<Logger>* logger); std::shared_ptr<Logger>* logger);
} // namespace rocksdb } // namespace rocksdb

@ -197,7 +197,7 @@ TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
} }
TEST(AutoRollLoggerTest, CreateLoggerFromOptions) { TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
Options options; DBOptions options;
shared_ptr<Logger> logger; shared_ptr<Logger> logger;
// Normal logger // Normal logger

@ -314,24 +314,12 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
} }
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
#ifdef __SSE4_2__ #ifdef __SSE4_2__
*l = _mm_crc32_u64(*l, LE_LOAD64(*p)); *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
*p += 8; *p += 8;
#else #else
Slow_CRC32(l, p); Slow_CRC32(l, p);
#endif #endif
}
// Detect if SS42 or not.
static bool isSSE42() {
#ifdef __GNUC__
uint32_t c_;
uint32_t d_;
__asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
return c_ & (1U << 20); // copied from CpuId.h in Folly.
#else
return false;
#endif
} }
template<void (*CRC32)(uint64_t*, uint8_t const**)> template<void (*CRC32)(uint64_t*, uint8_t const**)>
@ -377,6 +365,18 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
return l ^ 0xffffffffu; return l ^ 0xffffffffu;
} }
// Detect if SS42 or not.
static bool isSSE42() {
#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
uint32_t c_;
uint32_t d_;
__asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
return c_ & (1U << 20); // copied from CpuId.h in Folly.
#else
return false;
#endif
}
typedef uint32_t (*Function)(uint32_t, const char*, size_t); typedef uint32_t (*Function)(uint32_t, const char*, size_t);
static inline Function Choose_Extend() { static inline Function Choose_Extend() {

@ -3,6 +3,8 @@
// LICENSE file in the root directory of this source tree. An additional grant // LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory. // of patent rights can be found in the PATENTS file in the same directory.
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm> #include <algorithm>
#include <gflags/gflags.h> #include <gflags/gflags.h>
@ -74,11 +76,12 @@ TEST(DynamicBloomTest, VaryingLengths) {
// Count number of filters that significantly exceed the false positive rate // Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0; int mediocre_filters = 0;
int good_filters = 0; int good_filters = 0;
uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
fprintf(stderr, "bits_per_key: %d num_probes: %d\n", fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
FLAGS_bits_per_key, FLAGS_num_probes); FLAGS_bits_per_key, num_probes);
for (uint32_t cl_per_block = 0; cl_per_block < FLAGS_num_probes; for (uint32_t cl_per_block = 0; cl_per_block < num_probes;
++cl_per_block) { ++cl_per_block) {
for (uint32_t num = 1; num <= 10000; num = NextNum(num)) { for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
uint32_t bloom_bits = 0; uint32_t bloom_bits = 0;
@ -88,7 +91,7 @@ TEST(DynamicBloomTest, VaryingLengths) {
bloom_bits = std::max(num * FLAGS_bits_per_key, bloom_bits = std::max(num * FLAGS_bits_per_key,
cl_per_block * CACHE_LINE_SIZE * 8); cl_per_block * CACHE_LINE_SIZE * 8);
} }
DynamicBloom bloom(bloom_bits, cl_per_block, FLAGS_num_probes); DynamicBloom bloom(bloom_bits, cl_per_block, num_probes);
for (uint64_t i = 0; i < num; i++) { for (uint64_t i = 0; i < num; i++) {
bloom.Add(Key(i, buffer)); bloom.Add(Key(i, buffer));
ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
@ -127,6 +130,7 @@ TEST(DynamicBloomTest, VaryingLengths) {
TEST(DynamicBloomTest, perf) { TEST(DynamicBloomTest, perf) {
StopWatchNano timer(Env::Default()); StopWatchNano timer(Env::Default());
uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
if (!FLAGS_enable_perf) { if (!FLAGS_enable_perf) {
return; return;
@ -134,9 +138,9 @@ TEST(DynamicBloomTest, perf) {
for (uint64_t m = 1; m <= 8; ++m) { for (uint64_t m = 1; m <= 8; ++m) {
const uint64_t num_keys = m * 8 * 1024 * 1024; const uint64_t num_keys = m * 8 * 1024 * 1024;
fprintf(stderr, "testing %luM keys\n", m * 8); fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8);
DynamicBloom std_bloom(num_keys * 10, 0, FLAGS_num_probes); DynamicBloom std_bloom(num_keys * 10, 0, num_probes);
timer.Start(); timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) { for (uint64_t i = 1; i <= num_keys; ++i) {
@ -144,7 +148,7 @@ TEST(DynamicBloomTest, perf) {
} }
uint64_t elapsed = timer.ElapsedNanos(); uint64_t elapsed = timer.ElapsedNanos();
fprintf(stderr, "standard bloom, avg add latency %lu\n", fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
elapsed / num_keys); elapsed / num_keys);
uint64_t count = 0; uint64_t count = 0;
@ -155,13 +159,13 @@ TEST(DynamicBloomTest, perf) {
} }
} }
elapsed = timer.ElapsedNanos(); elapsed = timer.ElapsedNanos();
fprintf(stderr, "standard bloom, avg query latency %lu\n", fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
elapsed / count); elapsed / count);
ASSERT_TRUE(count == num_keys); ASSERT_TRUE(count == num_keys);
for (int cl_per_block = 1; cl_per_block <= FLAGS_num_probes; for (uint32_t cl_per_block = 1; cl_per_block <= num_probes;
++cl_per_block) { ++cl_per_block) {
DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, FLAGS_num_probes); DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes);
timer.Start(); timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) { for (uint64_t i = 1; i <= num_keys; ++i) {
@ -169,7 +173,7 @@ TEST(DynamicBloomTest, perf) {
} }
uint64_t elapsed = timer.ElapsedNanos(); uint64_t elapsed = timer.ElapsedNanos();
fprintf(stderr, "blocked bloom(%d), avg add latency %lu\n", fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n",
cl_per_block, elapsed / num_keys); cl_per_block, elapsed / num_keys);
uint64_t count = 0; uint64_t count = 0;
@ -182,7 +186,7 @@ TEST(DynamicBloomTest, perf) {
} }
elapsed = timer.ElapsedNanos(); elapsed = timer.ElapsedNanos();
fprintf(stderr, "blocked bloom(%d), avg query latency %lu\n", fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n",
cl_per_block, elapsed / count); cl_per_block, elapsed / count);
ASSERT_TRUE(count == num_keys); ASSERT_TRUE(count == num_keys);
} }

@ -231,7 +231,7 @@ EnvWrapper::~EnvWrapper() {
namespace { // anonymous namespace namespace { // anonymous namespace
void AssignEnvOptions(EnvOptions* env_options, const Options& options) { void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
env_options->use_os_buffer = options.allow_os_buffer; env_options->use_os_buffer = options.allow_os_buffer;
env_options->use_mmap_reads = options.allow_mmap_reads; env_options->use_mmap_reads = options.allow_mmap_reads;
env_options->use_mmap_writes = options.allow_mmap_writes; env_options->use_mmap_writes = options.allow_mmap_writes;
@ -249,12 +249,12 @@ EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
return env_options; return env_options;
} }
EnvOptions::EnvOptions(const Options& options) { EnvOptions::EnvOptions(const DBOptions& options) {
AssignEnvOptions(this, options); AssignEnvOptions(this, options);
} }
EnvOptions::EnvOptions() { EnvOptions::EnvOptions() {
Options options; DBOptions options;
AssignEnvOptions(this, options); AssignEnvOptions(this, options);
} }

@ -22,12 +22,6 @@ namespace {
typedef const char* Key; typedef const char* Key;
struct Node { struct Node {
explicit Node(const Key& k) :
key(k) {
}
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can // Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary. // add the appropriate barriers as necessary.
Node* Next() { Node* Next() {
@ -40,17 +34,19 @@ struct Node {
// pointer observes a fully initialized version of the inserted node. // pointer observes a fully initialized version of the inserted node.
next_.Release_Store(x); next_.Release_Store(x);
} }
// No-barrier variants that can be safely used in a few locations. // No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next() { Node* NoBarrier_Next() {
return reinterpret_cast<Node*>(next_.NoBarrier_Load()); return reinterpret_cast<Node*>(next_.NoBarrier_Load());
} }
void NoBarrier_SetNext(Node* x) { void NoBarrier_SetNext(Node* x) {
next_.NoBarrier_Store(x); next_.NoBarrier_Store(x);
} }
private: private:
port::AtomicPointer next_; port::AtomicPointer next_;
public:
char key[0];
}; };
class HashLinkListRep : public MemTableRep { class HashLinkListRep : public MemTableRep {
@ -58,7 +54,9 @@ class HashLinkListRep : public MemTableRep {
HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform, size_t bucket_size); const SliceTransform* transform, size_t bucket_size);
virtual void Insert(const char* key) override; virtual KeyHandle Allocate(const size_t len, char** buf) override;
virtual void Insert(KeyHandle handle) override;
virtual bool Contains(const char* key) const override; virtual bool Contains(const char* key) const override;
@ -93,8 +91,6 @@ class HashLinkListRep : public MemTableRep {
const SliceTransform* transform_; const SliceTransform* transform_;
const MemTableRep::KeyComparator& compare_; const MemTableRep::KeyComparator& compare_;
// immutable after construction
Arena* const arena_;
bool BucketContains(Node* head, const Slice& key) const; bool BucketContains(Node* head, const Slice& key) const;
@ -114,11 +110,6 @@ class HashLinkListRep : public MemTableRep {
return GetBucket(GetHash(slice)); return GetBucket(GetHash(slice));
} }
Node* NewNode(const Key& key) {
char* mem = arena_->AllocateAligned(sizeof(Node));
return new (mem) Node(key);
}
bool Equal(const Slice& a, const Key& b) const { bool Equal(const Slice& a, const Key& b) const {
return (compare_(b, a) == 0); return (compare_(b, a) == 0);
} }
@ -318,10 +309,10 @@ class HashLinkListRep : public MemTableRep {
HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform, Arena* arena, const SliceTransform* transform,
size_t bucket_size) size_t bucket_size)
: bucket_size_(bucket_size), : MemTableRep(arena),
bucket_size_(bucket_size),
transform_(transform), transform_(transform),
compare_(compare), compare_(compare) {
arena_(arena) {
char* mem = arena_->AllocateAligned( char* mem = arena_->AllocateAligned(
sizeof(port::AtomicPointer) * bucket_size); sizeof(port::AtomicPointer) * bucket_size);
@ -335,15 +326,22 @@ HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
HashLinkListRep::~HashLinkListRep() { HashLinkListRep::~HashLinkListRep() {
} }
void HashLinkListRep::Insert(const char* key) { KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
assert(!Contains(key)); char* mem = arena_->AllocateAligned(sizeof(Node) + len);
Slice internal_key = GetLengthPrefixedSlice(key); Node* x = new (mem) Node();
*buf = x->key;
return static_cast<void*>(x);
}
void HashLinkListRep::Insert(KeyHandle handle) {
Node* x = static_cast<Node*>(handle);
assert(!Contains(x->key));
Slice internal_key = GetLengthPrefixedSlice(x->key);
auto transformed = GetPrefix(internal_key); auto transformed = GetPrefix(internal_key);
auto& bucket = buckets_[GetHash(transformed)]; auto& bucket = buckets_[GetHash(transformed)];
Node* head = static_cast<Node*>(bucket.Acquire_Load()); Node* head = static_cast<Node*>(bucket.Acquire_Load());
if (!head) { if (!head) {
Node* x = NewNode(key);
// NoBarrier_SetNext() suffices since we will add a barrier when // NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i]. // we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(nullptr); x->NoBarrier_SetNext(nullptr);
@ -372,9 +370,7 @@ void HashLinkListRep::Insert(const char* key) {
} }
// Our data structure does not allow duplicate insertion // Our data structure does not allow duplicate insertion
assert(cur == nullptr || !Equal(key, cur->key)); assert(cur == nullptr || !Equal(x->key, cur->key));
Node* x = NewNode(key);
// NoBarrier_SetNext() suffices since we will add a barrier when // NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i]. // we publish a pointer to "x" in prev[i].

@ -25,7 +25,7 @@ class HashSkipListRep : public MemTableRep {
const SliceTransform* transform, size_t bucket_size, const SliceTransform* transform, size_t bucket_size,
int32_t skiplist_height, int32_t skiplist_branching_factor); int32_t skiplist_height, int32_t skiplist_branching_factor);
virtual void Insert(const char* key) override; virtual void Insert(KeyHandle handle) override;
virtual bool Contains(const char* key) const override; virtual bool Contains(const char* key) const override;
@ -225,7 +225,8 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform, Arena* arena, const SliceTransform* transform,
size_t bucket_size, int32_t skiplist_height, size_t bucket_size, int32_t skiplist_height,
int32_t skiplist_branching_factor) int32_t skiplist_branching_factor)
: bucket_size_(bucket_size), : MemTableRep(arena),
bucket_size_(bucket_size),
skiplist_height_(skiplist_height), skiplist_height_(skiplist_height),
skiplist_branching_factor_(skiplist_branching_factor), skiplist_branching_factor_(skiplist_branching_factor),
transform_(transform), transform_(transform),
@ -255,7 +256,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
return bucket; return bucket;
} }
void HashSkipListRep::Insert(const char* key) { void HashSkipListRep::Insert(KeyHandle handle) {
auto* key = static_cast<char*>(handle);
assert(!Contains(key)); assert(!Contains(key));
auto transformed = transform_->Transform(UserKey(key)); auto transformed = transform_->Transform(UserKey(key));
auto bucket = GetInitializedBucket(transformed); auto bucket = GetInitializedBucket(transformed);

@ -11,6 +11,7 @@
#include "db/filename.h" #include "db/filename.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "rocksdb/cache.h"
#include "util/coding.h" #include "util/coding.h"
#include <ctime> #include <ctime>
@ -152,6 +153,8 @@ LDBCommand* LDBCommand::SelectCommand(
return new DBLoaderCommand(cmdParams, option_map, flags); return new DBLoaderCommand(cmdParams, option_map, flags);
} else if (cmd == ManifestDumpCommand::Name()) { } else if (cmd == ManifestDumpCommand::Name()) {
return new ManifestDumpCommand(cmdParams, option_map, flags); return new ManifestDumpCommand(cmdParams, option_map, flags);
} else if (cmd == ListColumnFamiliesCommand::Name()) {
return new ListColumnFamiliesCommand(cmdParams, option_map, flags);
} else if (cmd == InternalDumpCommand::Name()) { } else if (cmd == InternalDumpCommand::Name()) {
return new InternalDumpCommand(cmdParams, option_map, flags); return new InternalDumpCommand(cmdParams, option_map, flags);
} else if (cmd == CheckConsistencyCommand::Name()) { } else if (cmd == CheckConsistencyCommand::Name()) {
@ -540,11 +543,10 @@ void ManifestDumpCommand::DoCommand() {
EnvOptions sopt; EnvOptions sopt;
std::string file(manifestfile); std::string file(manifestfile);
std::string dbname("dummy"); std::string dbname("dummy");
TableCache* tc = new TableCache(dbname, &options, sopt, 10); std::shared_ptr<Cache> tc(NewLRUCache(
const InternalKeyComparator* cmp = options.max_open_files - 10, options.table_cache_numshardbits,
new InternalKeyComparator(options.comparator); options.table_cache_remove_scan_count_limit));
VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get());
VersionSet* versions = new VersionSet(dbname, &options, sopt, tc, cmp);
Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_); Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_);
if (!s.ok()) { if (!s.ok()) {
printf("Error in processing file %s %s\n", manifestfile.c_str(), printf("Error in processing file %s %s\n", manifestfile.c_str(),
@ -557,6 +559,48 @@ void ManifestDumpCommand::DoCommand() {
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
void ListColumnFamiliesCommand::Help(string& ret) {
ret.append(" ");
ret.append(ListColumnFamiliesCommand::Name());
ret.append(" full_path_to_db_directory ");
ret.append("\n");
}
ListColumnFamiliesCommand::ListColumnFamiliesCommand(
const vector<string>& params, const map<string, string>& options,
const vector<string>& flags)
: LDBCommand(options, flags, false, {}) {
if (params.size() != 1) {
exec_state_ = LDBCommandExecuteResult::FAILED(
"dbname must be specified for the list_column_families command");
} else {
dbname_ = params[0];
}
}
void ListColumnFamiliesCommand::DoCommand() {
vector<string> column_families;
Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families);
if (!s.ok()) {
printf("Error in processing db %s %s\n", dbname_.c_str(),
s.ToString().c_str());
} else {
printf("Column families in %s: \n{", dbname_.c_str());
bool first = true;
for (auto cf : column_families) {
if (!first) {
printf(", ");
}
first = false;
printf("%s", cf.c_str());
}
printf("}\n");
}
}
// ----------------------------------------------------------------------------
string ReadableTime(int unixtime) { string ReadableTime(int unixtime) {
char time_buffer [80]; char time_buffer [80];
time_t rawtime = unixtime; time_t rawtime = unixtime;
@ -1018,19 +1062,26 @@ Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
int* levels) { int* levels) {
EnvOptions soptions; EnvOptions soptions;
TableCache tc(db_path_, &opt, soptions, 10); std::shared_ptr<Cache> tc(
NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits,
opt.table_cache_remove_scan_count_limit));
const InternalKeyComparator cmp(opt.comparator); const InternalKeyComparator cmp(opt.comparator);
VersionSet versions(db_path_, &opt, soptions, &tc, &cmp); VersionSet versions(db_path_, &opt, soptions, tc.get());
std::vector<ColumnFamilyDescriptor> dummy;
ColumnFamilyDescriptor dummy_descriptor(default_column_family_name,
ColumnFamilyOptions(opt));
dummy.push_back(dummy_descriptor);
// We rely the VersionSet::Recover to tell us the internal data structures // We rely the VersionSet::Recover to tell us the internal data structures
// in the db. And the Recover() should never do any change // in the db. And the Recover() should never do any change
// (like LogAndApply) to the manifest file. // (like LogAndApply) to the manifest file.
Status st = versions.Recover(); Status st = versions.Recover(dummy);
if (!st.ok()) { if (!st.ok()) {
return st; return st;
} }
int max = -1; int max = -1;
for (int i = 0; i < versions.NumberLevels(); i++) { auto default_cfd = versions.GetColumnFamilySet()->GetDefault();
if (versions.current()->NumLevelFiles(i)) { for (int i = 0; i < default_cfd->NumberLevels(); i++) {
if (default_cfd->current()->NumLevelFiles(i)) {
max = i; max = i;
} }
} }
@ -1075,7 +1126,6 @@ void ReduceDBLevelsCommand::DoCommand() {
CloseDB(); CloseDB();
EnvOptions soptions; EnvOptions soptions;
st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_);
if (!st.ok()) { if (!st.ok()) {
exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());

@ -484,6 +484,23 @@ private:
static const string ARG_PATH; static const string ARG_PATH;
}; };
class ListColumnFamiliesCommand : public LDBCommand {
public:
static string Name() { return "list_column_families"; }
ListColumnFamiliesCommand(const vector<string>& params,
const map<string, string>& options,
const vector<string>& flags);
static void Help(string& ret);
virtual void DoCommand();
virtual bool NoDBOpen() { return true; }
private:
string dbname_;
};
class ReduceDBLevelsCommand : public LDBCommand { class ReduceDBLevelsCommand : public LDBCommand {
public: public:
static string Name() { return "reduce_levels"; } static string Name() { return "reduce_levels"; }

@ -64,6 +64,7 @@ public:
DBDumperCommand::Help(ret); DBDumperCommand::Help(ret);
DBLoaderCommand::Help(ret); DBLoaderCommand::Help(ret);
ManifestDumpCommand::Help(ret); ManifestDumpCommand::Help(ret);
ListColumnFamiliesCommand::Help(ret);
InternalDumpCommand::Help(ret); InternalDumpCommand::Help(ret);
fprintf(stderr, "%s\n", ret.c_str()); fprintf(stderr, "%s\n", ret.c_str());

@ -26,23 +26,17 @@
namespace rocksdb { namespace rocksdb {
Options::Options() ColumnFamilyOptions::ColumnFamilyOptions()
: comparator(BytewiseComparator()), : comparator(BytewiseComparator()),
merge_operator(nullptr), merge_operator(nullptr),
compaction_filter(nullptr), compaction_filter(nullptr),
compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>( compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
new DefaultCompactionFilterFactory())), new DefaultCompactionFilterFactory())),
compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()), compaction_filter_factory_v2(
create_if_missing(false), new DefaultCompactionFilterFactoryV2()),
error_if_exists(false),
paranoid_checks(true),
env(Env::Default()),
info_log(nullptr),
info_log_level(INFO),
write_buffer_size(4 << 20), write_buffer_size(4 << 20),
max_write_buffer_number(2), max_write_buffer_number(2),
min_write_buffer_number_to_merge(1), min_write_buffer_number_to_merge(1),
max_open_files(5000),
block_cache(nullptr), block_cache(nullptr),
block_cache_compressed(nullptr), block_cache_compressed(nullptr),
block_size(4096), block_size(4096),
@ -64,88 +58,252 @@ Options::Options()
expanded_compaction_factor(25), expanded_compaction_factor(25),
source_compaction_factor(1), source_compaction_factor(1),
max_grandparent_overlap_factor(10), max_grandparent_overlap_factor(10),
disable_seek_compaction(true),
soft_rate_limit(0.0),
hard_rate_limit(0.0),
rate_limit_delay_max_milliseconds(1000),
no_block_cache(false),
arena_block_size(0),
disable_auto_compactions(false),
purge_redundant_kvs_while_flush(true),
block_size_deviation(10),
compaction_style(kCompactionStyleLevel),
verify_checksums_in_compaction(true),
filter_deletes(false),
max_sequential_skip_in_iterations(8),
memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
table_factory(
std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
inplace_update_support(false),
inplace_update_num_locks(10000),
inplace_callback(nullptr),
memtable_prefix_bloom_bits(0),
memtable_prefix_bloom_probes(6),
bloom_locality(0),
max_successive_merges(0),
min_partial_merge_operands(2) {
assert(memtable_factory.get() != nullptr);
}
// Constructs ColumnFamilyOptions by copying the column-family-scoped subset
// of fields out of a legacy monolithic Options object (the DB-wide fields go
// to DBOptions instead). The initializer order must mirror the member
// declaration order in options.h; do not reorder.
ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
: comparator(options.comparator),
merge_operator(options.merge_operator),
compaction_filter(options.compaction_filter),
compaction_filter_factory(options.compaction_filter_factory),
compaction_filter_factory_v2(options.compaction_filter_factory_v2),
write_buffer_size(options.write_buffer_size),
max_write_buffer_number(options.max_write_buffer_number),
min_write_buffer_number_to_merge(
options.min_write_buffer_number_to_merge),
block_cache(options.block_cache),
block_cache_compressed(options.block_cache_compressed),
block_size(options.block_size),
block_restart_interval(options.block_restart_interval),
compression(options.compression),
compression_per_level(options.compression_per_level),
compression_opts(options.compression_opts),
filter_policy(options.filter_policy),
prefix_extractor(options.prefix_extractor),
whole_key_filtering(options.whole_key_filtering),
num_levels(options.num_levels),
level0_file_num_compaction_trigger(
options.level0_file_num_compaction_trigger),
level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
level0_stop_writes_trigger(options.level0_stop_writes_trigger),
max_mem_compaction_level(options.max_mem_compaction_level),
target_file_size_base(options.target_file_size_base),
target_file_size_multiplier(options.target_file_size_multiplier),
max_bytes_for_level_base(options.max_bytes_for_level_base),
max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
max_bytes_for_level_multiplier_additional(
options.max_bytes_for_level_multiplier_additional),
expanded_compaction_factor(options.expanded_compaction_factor),
source_compaction_factor(options.source_compaction_factor),
max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
disable_seek_compaction(options.disable_seek_compaction),
soft_rate_limit(options.soft_rate_limit),
hard_rate_limit(options.hard_rate_limit),
rate_limit_delay_max_milliseconds(
options.rate_limit_delay_max_milliseconds),
no_block_cache(options.no_block_cache),
arena_block_size(options.arena_block_size),
disable_auto_compactions(options.disable_auto_compactions),
purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
block_size_deviation(options.block_size_deviation),
compaction_style(options.compaction_style),
verify_checksums_in_compaction(options.verify_checksums_in_compaction),
compaction_options_universal(options.compaction_options_universal),
filter_deletes(options.filter_deletes),
max_sequential_skip_in_iterations(
options.max_sequential_skip_in_iterations),
memtable_factory(options.memtable_factory),
table_factory(options.table_factory),
table_properties_collectors(options.table_properties_collectors),
inplace_update_support(options.inplace_update_support),
inplace_update_num_locks(options.inplace_update_num_locks),
inplace_callback(options.inplace_callback),
memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
bloom_locality(options.bloom_locality),
max_successive_merges(options.max_successive_merges),
min_partial_merge_operands(options.min_partial_merge_operands) {
// A memtable factory is mandatory; a default-constructed Options always
// supplies one, so a null here indicates a corrupted source Options.
assert(memtable_factory.get() != nullptr);
}
DBOptions::DBOptions()
: create_if_missing(false),
error_if_exists(false),
paranoid_checks(true),
env(Env::Default()),
info_log(nullptr),
info_log_level(INFO),
max_open_files(5000),
statistics(nullptr),
disableDataSync(false), disableDataSync(false),
use_fsync(false), use_fsync(false),
db_stats_log_interval(1800), db_stats_log_interval(1800),
db_log_dir(""), db_log_dir(""),
wal_dir(""), wal_dir(""),
disable_seek_compaction(true),
delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL), delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),
max_background_compactions(1), max_background_compactions(1),
max_background_flushes(1), max_background_flushes(1),
max_log_file_size(0), max_log_file_size(0),
log_file_time_to_roll(0), log_file_time_to_roll(0),
keep_log_file_num(1000), keep_log_file_num(1000),
soft_rate_limit(0.0),
hard_rate_limit(0.0),
rate_limit_delay_max_milliseconds(1000),
max_manifest_file_size(std::numeric_limits<uint64_t>::max()), max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
no_block_cache(false),
table_cache_numshardbits(4), table_cache_numshardbits(4),
table_cache_remove_scan_count_limit(16), table_cache_remove_scan_count_limit(16),
arena_block_size(0),
disable_auto_compactions(false),
WAL_ttl_seconds(0), WAL_ttl_seconds(0),
WAL_size_limit_MB(0), WAL_size_limit_MB(0),
manifest_preallocation_size(4 * 1024 * 1024), manifest_preallocation_size(4 * 1024 * 1024),
purge_redundant_kvs_while_flush(true),
allow_os_buffer(true), allow_os_buffer(true),
allow_mmap_reads(false), allow_mmap_reads(false),
allow_mmap_writes(false), allow_mmap_writes(false),
is_fd_close_on_exec(true), is_fd_close_on_exec(true),
skip_log_error_on_recovery(false), skip_log_error_on_recovery(false),
stats_dump_period_sec(3600), stats_dump_period_sec(3600),
block_size_deviation(10),
advise_random_on_open(true), advise_random_on_open(true),
access_hint_on_compaction_start(NORMAL), access_hint_on_compaction_start(NORMAL),
use_adaptive_mutex(false), use_adaptive_mutex(false),
bytes_per_sync(0), bytes_per_sync(0),
compaction_style(kCompactionStyleLevel), allow_thread_local(true) {}
verify_checksums_in_compaction(true),
filter_deletes(false), DBOptions::DBOptions(const Options& options)
max_sequential_skip_in_iterations(8), : create_if_missing(options.create_if_missing),
memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)), error_if_exists(options.error_if_exists),
table_factory( paranoid_checks(options.paranoid_checks),
std::shared_ptr<TableFactory>(new BlockBasedTableFactory())), env(options.env),
inplace_update_support(false), info_log(options.info_log),
inplace_update_num_locks(10000), info_log_level(options.info_log_level),
inplace_callback(nullptr), max_open_files(options.max_open_files),
memtable_prefix_bloom_bits(0), statistics(options.statistics),
memtable_prefix_bloom_probes(6), disableDataSync(options.disableDataSync),
bloom_locality(0), use_fsync(options.use_fsync),
max_successive_merges(0), db_stats_log_interval(options.db_stats_log_interval),
min_partial_merge_operands(2), db_log_dir(options.db_log_dir),
allow_thread_local(true) { wal_dir(options.wal_dir),
assert(memtable_factory.get() != nullptr); delete_obsolete_files_period_micros(
} options.delete_obsolete_files_period_micros),
max_background_compactions(options.max_background_compactions),
max_background_flushes(options.max_background_flushes),
max_log_file_size(options.max_log_file_size),
log_file_time_to_roll(options.log_file_time_to_roll),
keep_log_file_num(options.keep_log_file_num),
max_manifest_file_size(options.max_manifest_file_size),
table_cache_numshardbits(options.table_cache_numshardbits),
table_cache_remove_scan_count_limit(
options.table_cache_remove_scan_count_limit),
WAL_ttl_seconds(options.WAL_ttl_seconds),
WAL_size_limit_MB(options.WAL_size_limit_MB),
manifest_preallocation_size(options.manifest_preallocation_size),
allow_os_buffer(options.allow_os_buffer),
allow_mmap_reads(options.allow_mmap_reads),
allow_mmap_writes(options.allow_mmap_writes),
is_fd_close_on_exec(options.is_fd_close_on_exec),
skip_log_error_on_recovery(options.skip_log_error_on_recovery),
stats_dump_period_sec(options.stats_dump_period_sec),
advise_random_on_open(options.advise_random_on_open),
access_hint_on_compaction_start(options.access_hint_on_compaction_start),
use_adaptive_mutex(options.use_adaptive_mutex),
bytes_per_sync(options.bytes_per_sync),
allow_thread_local(options.allow_thread_local) {}
static const char* const access_hints[] = { static const char* const access_hints[] = {
"NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
}; };
void void DBOptions::Dump(Logger* log) const {
Options::Dump(Logger* log) const
{
Log(log," Options.comparator: %s", comparator->Name());
Log(log," Options.merge_operator: %s",
merge_operator? merge_operator->Name() : "None");
Log(log," Options.compaction_filter: %s",
compaction_filter? compaction_filter->Name() : "None");
Log(log," Options.compaction_filter_factory: %s",
compaction_filter_factory->Name());
Log(log, " Options.compaction_filter_factory_v2: %s",
compaction_filter_factory_v2->Name());
Log(log," Options.memtable_factory: %s",
memtable_factory->Name());
Log(log," Options.table_factory: %s", table_factory->Name());
Log(log," Options.error_if_exists: %d", error_if_exists); Log(log," Options.error_if_exists: %d", error_if_exists);
Log(log," Options.create_if_missing: %d", create_if_missing); Log(log," Options.create_if_missing: %d", create_if_missing);
Log(log," Options.paranoid_checks: %d", paranoid_checks); Log(log," Options.paranoid_checks: %d", paranoid_checks);
Log(log," Options.env: %p", env); Log(log," Options.env: %p", env);
Log(log," Options.info_log: %p", info_log.get()); Log(log," Options.info_log: %p", info_log.get());
Log(log," Options.write_buffer_size: %zd", write_buffer_size);
Log(log," Options.max_write_buffer_number: %d", max_write_buffer_number);
Log(log," Options.max_open_files: %d", max_open_files); Log(log," Options.max_open_files: %d", max_open_files);
Log(log, " Options.disableDataSync: %d", disableDataSync);
Log(log, " Options.use_fsync: %d", use_fsync);
Log(log, " Options.max_log_file_size: %zu", max_log_file_size);
Log(log, "Options.max_manifest_file_size: %lu",
(unsigned long)max_manifest_file_size);
Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num);
Log(log, " Options.db_stats_log_interval: %d", db_stats_log_interval);
Log(log, " Options.allow_os_buffer: %d", allow_os_buffer);
Log(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
Log(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
Log(log, " Options.db_log_dir: %s",
db_log_dir.c_str());
Log(log, " Options.wal_dir: %s",
wal_dir.c_str());
Log(log, " Options.table_cache_numshardbits: %d",
table_cache_numshardbits);
Log(log, " Options.table_cache_remove_scan_count_limit: %d",
table_cache_remove_scan_count_limit);
Log(log, " Options.delete_obsolete_files_period_micros: %lu",
(unsigned long)delete_obsolete_files_period_micros);
Log(log, " Options.max_background_compactions: %d",
max_background_compactions);
Log(log, " Options.max_background_flushes: %d",
max_background_flushes);
Log(log, " Options.WAL_ttl_seconds: %lu",
(unsigned long)WAL_ttl_seconds);
Log(log, " Options.WAL_size_limit_MB: %lu",
(unsigned long)WAL_size_limit_MB);
Log(log, " Options.manifest_preallocation_size: %zu",
manifest_preallocation_size);
Log(log, " Options.allow_os_buffer: %d",
allow_os_buffer);
Log(log, " Options.allow_mmap_reads: %d",
allow_mmap_reads);
Log(log, " Options.allow_mmap_writes: %d",
allow_mmap_writes);
Log(log, " Options.is_fd_close_on_exec: %d",
is_fd_close_on_exec);
Log(log, " Options.skip_log_error_on_recovery: %d",
skip_log_error_on_recovery);
Log(log, " Options.stats_dump_period_sec: %u",
stats_dump_period_sec);
Log(log, " Options.advise_random_on_open: %d",
advise_random_on_open);
Log(log, " Options.access_hint_on_compaction_start: %s",
access_hints[access_hint_on_compaction_start]);
Log(log, " Options.use_adaptive_mutex: %d",
use_adaptive_mutex);
Log(log, " Options.bytes_per_sync: %lu",
(unsigned long)bytes_per_sync);
} // DBOptions::Dump
void ColumnFamilyOptions::Dump(Logger* log) const {
Log(log, " Options.comparator: %s", comparator->Name());
Log(log, " Options.merge_operator: %s",
merge_operator ? merge_operator->Name() : "None");
Log(log, " Options.compaction_filter_factory: %s",
compaction_filter_factory->Name());
Log(log, " Options.compaction_filter_factory_v2: %s",
compaction_filter_factory_v2->Name());
Log(log, " Options.memtable_factory: %s", memtable_factory->Name());
Log(log, " Options.table_factory: %s", table_factory->Name());
Log(log, " Options.write_buffer_size: %zd", write_buffer_size);
Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number);
Log(log," Options.block_cache: %p", block_cache.get()); Log(log," Options.block_cache: %p", block_cache.get());
Log(log," Options.block_cache_compressed: %p", Log(log," Options.block_cache_compressed: %p",
block_cache_compressed.get()); block_cache_compressed.get());
@ -173,18 +331,6 @@ Options::Dump(Logger* log) const
prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
Log(log," Options.whole_key_filtering: %d", whole_key_filtering); Log(log," Options.whole_key_filtering: %d", whole_key_filtering);
Log(log," Options.num_levels: %d", num_levels); Log(log," Options.num_levels: %d", num_levels);
Log(log," Options.disableDataSync: %d", disableDataSync);
Log(log," Options.use_fsync: %d", use_fsync);
Log(log," Options.max_log_file_size: %zu", max_log_file_size);
Log(log,"Options.max_manifest_file_size: %lu",
(unsigned long)max_manifest_file_size);
Log(log," Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
Log(log," Options.keep_log_file_num: %zu", keep_log_file_num);
Log(log," Options.db_stats_log_interval: %d",
db_stats_log_interval);
Log(log," Options.allow_os_buffer: %d", allow_os_buffer);
Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads);
Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes);
Log(log," Options.min_write_buffer_number_to_merge: %d", Log(log," Options.min_write_buffer_number_to_merge: %d",
min_write_buffer_number_to_merge); min_write_buffer_number_to_merge);
Log(log," Options.purge_redundant_kvs_while_flush: %d", Log(log," Options.purge_redundant_kvs_while_flush: %d",
@ -223,26 +369,12 @@ Options::Dump(Logger* log) const
source_compaction_factor); source_compaction_factor);
Log(log," Options.max_grandparent_overlap_factor: %d", Log(log," Options.max_grandparent_overlap_factor: %d",
max_grandparent_overlap_factor); max_grandparent_overlap_factor);
Log(log," Options.db_log_dir: %s",
db_log_dir.c_str());
Log(log," Options.wal_dir: %s",
wal_dir.c_str());
Log(log," Options.disable_seek_compaction: %d", Log(log," Options.disable_seek_compaction: %d",
disable_seek_compaction); disable_seek_compaction);
Log(log," Options.no_block_cache: %d", Log(log," Options.no_block_cache: %d",
no_block_cache); no_block_cache);
Log(log," Options.table_cache_numshardbits: %d",
table_cache_numshardbits);
Log(log," Options.table_cache_remove_scan_count_limit: %d",
table_cache_remove_scan_count_limit);
Log(log," Options.arena_block_size: %zu", Log(log," Options.arena_block_size: %zu",
arena_block_size); arena_block_size);
Log(log," Options.delete_obsolete_files_period_micros: %lu",
(unsigned long)delete_obsolete_files_period_micros);
Log(log," Options.max_background_compactions: %d",
max_background_compactions);
Log(log," Options.max_background_flushes: %d",
max_background_flushes);
Log(log," Options.soft_rate_limit: %.2f", Log(log," Options.soft_rate_limit: %.2f",
soft_rate_limit); soft_rate_limit);
Log(log," Options.hard_rate_limit: %.2f", Log(log," Options.hard_rate_limit: %.2f",
@ -251,36 +383,10 @@ Options::Dump(Logger* log) const
rate_limit_delay_max_milliseconds); rate_limit_delay_max_milliseconds);
Log(log," Options.disable_auto_compactions: %d", Log(log," Options.disable_auto_compactions: %d",
disable_auto_compactions); disable_auto_compactions);
Log(log," Options.WAL_ttl_seconds: %lu",
(unsigned long)WAL_ttl_seconds);
Log(log," Options.WAL_size_limit_MB: %lu",
(unsigned long)WAL_size_limit_MB);
Log(log," Options.manifest_preallocation_size: %zu",
manifest_preallocation_size);
Log(log," Options.purge_redundant_kvs_while_flush: %d", Log(log," Options.purge_redundant_kvs_while_flush: %d",
purge_redundant_kvs_while_flush); purge_redundant_kvs_while_flush);
Log(log," Options.allow_os_buffer: %d",
allow_os_buffer);
Log(log," Options.allow_mmap_reads: %d",
allow_mmap_reads);
Log(log," Options.allow_mmap_writes: %d",
allow_mmap_writes);
Log(log," Options.is_fd_close_on_exec: %d",
is_fd_close_on_exec);
Log(log," Options.skip_log_error_on_recovery: %d",
skip_log_error_on_recovery);
Log(log," Options.stats_dump_period_sec: %u",
stats_dump_period_sec);
Log(log," Options.block_size_deviation: %d", Log(log," Options.block_size_deviation: %d",
block_size_deviation); block_size_deviation);
Log(log," Options.advise_random_on_open: %d",
advise_random_on_open);
Log(log," Options.access_hint_on_compaction_start: %s",
access_hints[access_hint_on_compaction_start]);
Log(log," Options.use_adaptive_mutex: %d",
use_adaptive_mutex);
Log(log," Options.bytes_per_sync: %lu",
(unsigned long)bytes_per_sync);
Log(log," Options.filter_deletes: %d", Log(log," Options.filter_deletes: %d",
filter_deletes); filter_deletes);
Log(log, " Options.verify_checksums_in_compaction: %d", Log(log, " Options.verify_checksums_in_compaction: %d",
@ -317,8 +423,15 @@ Options::Dump(Logger* log) const
memtable_prefix_bloom_bits); memtable_prefix_bloom_bits);
Log(log, " Options.memtable_prefix_bloom_probes: %d", Log(log, " Options.memtable_prefix_bloom_probes: %d",
memtable_prefix_bloom_probes); memtable_prefix_bloom_probes);
Log(log, " Options.bloom_locality: %d",
bloom_locality);
Log(log, " Options.max_successive_merges: %zd", Log(log, " Options.max_successive_merges: %zd",
max_successive_merges); max_successive_merges);
} // ColumnFamilyOptions::Dump
void Options::Dump(Logger* log) const {
DBOptions::Dump(log);
ColumnFamilyOptions::Dump(log);
} // Options::Dump } // Options::Dump
// //

@ -9,12 +9,21 @@
namespace rocksdb { namespace rocksdb {
// by default, enable counts only #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
PerfLevel perf_level = kEnableCount; PerfLevel perf_level = kEnableCount;
// This is a dummy variable since some place references it
PerfContext perf_context;
#else
__thread PerfLevel perf_level = kEnableCount;
__thread PerfContext perf_context;
#endif
void SetPerfLevel(PerfLevel level) { perf_level = level; } void SetPerfLevel(PerfLevel level) {
perf_level = level;
}
void PerfContext::Reset() { void PerfContext::Reset() {
#if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
user_key_comparison_count = 0; user_key_comparison_count = 0;
block_cache_hit_count = 0; block_cache_hit_count = 0;
block_read_count = 0; block_read_count = 0;
@ -38,11 +47,15 @@ void PerfContext::Reset() {
find_next_user_entry_time = 0; find_next_user_entry_time = 0;
write_pre_and_post_process_time = 0; write_pre_and_post_process_time = 0;
write_memtable_time = 0; write_memtable_time = 0;
#endif
} }
#define OUTPUT(counter) #counter << " = " << counter << ", " #define OUTPUT(counter) #counter << " = " << counter << ", "
std::string PerfContext::ToString() const { std::string PerfContext::ToString() const {
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
return "";
#else
std::ostringstream ss; std::ostringstream ss;
ss << OUTPUT(user_key_comparison_count) ss << OUTPUT(user_key_comparison_count)
<< OUTPUT(block_cache_hit_count) << OUTPUT(block_cache_hit_count)
@ -67,8 +80,7 @@ std::string PerfContext::ToString() const {
<< OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_pre_and_post_process_time)
<< OUTPUT(write_memtable_time); << OUTPUT(write_memtable_time);
return ss.str(); return ss.str();
#endif
} }
__thread PerfContext perf_context;
} }

@ -9,26 +9,80 @@
namespace rocksdb { namespace rocksdb {
extern enum PerfLevel perf_level; #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
inline void StartPerfTimer(StopWatchNano* timer) { #define PERF_TIMER_DECLARE()
if (perf_level >= PerfLevel::kEnableTime) { #define PERF_TIMER_START(metric)
timer->Start(); #define PERF_TIMER_AUTO(metric)
#define PERF_TIMER_MEASURE(metric)
#define PERF_TIMER_STOP(metric)
#define PERF_COUNTER_ADD(metric, value)
#else
extern __thread PerfLevel perf_level;
class PerfStepTimer {
public:
PerfStepTimer()
: enabled_(perf_level >= PerfLevel::kEnableTime),
env_(enabled_ ? Env::Default() : nullptr),
start_(0) {
} }
}
inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) { void Start() {
if (perf_level >= PerfLevel::kEnableCount) { if (enabled_) {
*count += delta; start_ = env_->NowNanos();
}
} }
}
inline void BumpPerfTime(uint64_t* time, void Measure(uint64_t* metric) {
StopWatchNano* timer, if (start_) {
bool reset = true) { uint64_t now = env_->NowNanos();
if (perf_level >= PerfLevel::kEnableTime) { *metric += now - start_;
*time += timer->ElapsedNanos(reset); start_ = now;
}
} }
}
void Stop(uint64_t* metric) {
if (start_) {
*metric += env_->NowNanos() - start_;
start_ = 0;
}
}
private:
const bool enabled_;
Env* const env_;
uint64_t start_;
};
// Declare the local timer object to be used later on
#define PERF_TIMER_DECLARE() \
PerfStepTimer perf_step_timer;
// Set start time of the timer
#define PERF_TIMER_START(metric) \
perf_step_timer.Start();
// Declare and set start time of the timer
#define PERF_TIMER_AUTO(metric) \
PerfStepTimer perf_step_timer; \
perf_step_timer.Start();
// Update metric with time elapsed since last START. start time is reset
// to current timestamp.
#define PERF_TIMER_MEASURE(metric) \
perf_step_timer.Measure(&(perf_context.metric));
// Update metric with time elapsed since last START. But start time is not set.
#define PERF_TIMER_STOP(metric) \
perf_step_timer.Stop(&(perf_context.metric));
// Increase metric value
#define PERF_COUNTER_ADD(metric, value) \
perf_context.metric += value;
#endif
} }

@ -13,13 +13,13 @@ class SkipListRep : public MemTableRep {
SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_; SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_;
public: public:
explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena) explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena)
: skip_list_(compare, arena) { : MemTableRep(arena), skip_list_(compare, arena) {
} }
// Insert key into the list. // Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list. // REQUIRES: nothing that compares equal to key is currently in the list.
virtual void Insert(const char* key) override { virtual void Insert(KeyHandle handle) override {
skip_list_.Insert(key); skip_list_.Insert(static_cast<char*>(handle));
} }
// Returns true iff an entry that compares equal to key is in the list. // Returns true iff an entry that compares equal to key is in the list.

@ -0,0 +1,62 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "util/sync_point.h"
namespace rocksdb {
// Returns the process-wide SyncPoint singleton. The function-local static is
// constructed on first use and initialization is thread-safe under C++11.
SyncPoint* SyncPoint::GetInstance() {
  static SyncPoint instance;
  return &instance;
}
void SyncPoint::LoadDependency(const std::vector<Dependency>& dependencies) {
successors_.clear();
predecessors_.clear();
cleared_points_.clear();
for (const auto& dependency : dependencies) {
successors_[dependency.predecessor].push_back(dependency.successor);
predecessors_[dependency.successor].push_back(dependency.predecessor);
}
}
// Returns true iff every registered predecessor of 'point' has already been
// executed, i.e. appears in cleared_points_. A point with no registered
// predecessors trivially returns true.
// REQUIRES: mutex_ held by the caller (Process() invokes this under lock).
// Note: operator[] inserts an empty vector for unknown points, which is
// harmless since it behaves like "no predecessors".
bool SyncPoint::PredecessorsAllCleared(const std::string& point) {
for (const auto& pred : predecessors_[point]) {
if (cleared_points_.count(pred) == 0) {
return false;
}
}
return true;
}
// Turns sync-point processing on; until this is called, Process() is a no-op.
void SyncPoint::EnableProcessing() {
  std::lock_guard<std::mutex> guard(mutex_);
  enabled_ = true;
}
// Turns sync-point processing back off; subsequent Process() calls return
// immediately without blocking.
void SyncPoint::DisableProcessing() {
  std::lock_guard<std::mutex> guard(mutex_);
  enabled_ = false;
}
// Forgets which sync points have been passed through, so a test can rerun
// the same dependency graph from a clean slate.
void SyncPoint::ClearTrace() {
  std::lock_guard<std::mutex> guard(mutex_);
  cleared_points_.clear();
}
// Entry point invoked by TEST_SYNC_POINT. Blocks the calling thread until
// every predecessor of 'point' (per LoadDependency) has been executed, then
// records 'point' as executed and wakes all other waiters.
void SyncPoint::Process(const std::string& point) {
std::unique_lock<std::mutex> lock(mutex_);
// Fast path: sync points are disabled by default, so release builds of the
// tests and unrelated code pay only the lock acquisition.
if (!enabled_) return;
// Standard condition-variable wait loop: the lock is released while
// waiting and re-acquired before the predicate is re-checked.
while (!PredecessorsAllCleared(point)) {
cv_.wait(lock);
}
// Mark this point executed and wake every waiter so each can re-evaluate
// its own predecessor set.
cleared_points_.insert(point);
cv_.notify_all();
}
} // namespace rocksdb

@ -0,0 +1,79 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <condition_variable>
#include <mutex>
#include <string>
#include <unordered_set>
#include <unordered_map>
#include <vector>
namespace rocksdb {
// This class provides facility to reproduce race conditions deterministically
// in unit tests.
// Developer could specify sync points in the codebase via TEST_SYNC_POINT.
// Each sync point represents a position in the execution stream of a thread.
// In the unit test, 'Happens After' relationship among sync points could be
// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of
// threads execution.
// Refer to (DBTest,TransactionLogIteratorRace), for an exmaple use case.
// Singleton registry of sync points and their happens-after dependencies.
// All public methods are thread-safe; see sync_point.cc for details.
class SyncPoint {
public:
// Returns the process-wide singleton instance.
static SyncPoint* GetInstance();
// A single happens-after edge: 'successor' may not proceed past its sync
// point until 'predecessor' has been executed.
struct Dependency {
std::string predecessor;
std::string successor;
};
// call once at the beginning of a test to setup the dependency between
// sync points
void LoadDependency(const std::vector<Dependency>& dependencies);
// enable sync point processing (disabled on startup)
void EnableProcessing();
// disable sync point processing
void DisableProcessing();
// remove the execution trace of all sync points
void ClearTrace();
// triggered by TEST_SYNC_POINT, blocking execution until all predecessors
// are executed.
void Process(const std::string& point);
// TODO: it might be useful to provide a function that blocks until all
// sync points are cleared.
private:
// Returns true iff all predecessors of 'point' have executed.
// REQUIRES: mutex_ held.
bool PredecessorsAllCleared(const std::string& point);
// successor/predecessor map loaded from LoadDependency
std::unordered_map<std::string, std::vector<std::string>> successors_;
std::unordered_map<std::string, std::vector<std::string>> predecessors_;
// mutex_ guards all state below and above; cv_ is signalled whenever a
// point is cleared so blocked threads can re-check their predecessors.
std::mutex mutex_;
std::condition_variable cv_;
// sync points that have been passed through
std::unordered_set<std::string> cleared_points_;
// processing is off until EnableProcessing() is called
bool enabled_ = false;
};
} // namespace rocksdb
// Use TEST_SYNC_POINT to specify sync points inside code base.
// Sync points can have happens-after depedency on other sync points,
// configured at runtime via SyncPoint::LoadDependency. This could be
// utilized to re-produce race conditions between threads.
// See TransactionLogIteratorRace in db_test.cc for an example use case.
// TEST_SYNC_POINT is no op in release build.
#ifdef NDEBUG
#define TEST_SYNC_POINT(x)
#else
#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
#endif

@ -16,6 +16,7 @@
#include "util/autovector.h" #include "util/autovector.h"
#include "port/port_posix.h" #include "port/port_posix.h"
#include "util/thread_local.h"
namespace rocksdb { namespace rocksdb {

@ -30,7 +30,7 @@ class VectorRep : public MemTableRep {
// single buffer and pass that in as the parameter to Insert) // single buffer and pass that in as the parameter to Insert)
// REQUIRES: nothing that compares equal to key is currently in the // REQUIRES: nothing that compares equal to key is currently in the
// collection. // collection.
virtual void Insert(const char* key) override; virtual void Insert(KeyHandle handle) override;
// Returns true iff an entry that compares equal to key is in the collection. // Returns true iff an entry that compares equal to key is in the collection.
virtual bool Contains(const char* key) const override; virtual bool Contains(const char* key) const override;
@ -106,7 +106,8 @@ class VectorRep : public MemTableRep {
const KeyComparator& compare_; const KeyComparator& compare_;
}; };
void VectorRep::Insert(const char* key) { void VectorRep::Insert(KeyHandle handle) {
auto* key = static_cast<char*>(handle);
assert(!Contains(key)); assert(!Contains(key));
WriteLock l(&rwlock_); WriteLock l(&rwlock_);
assert(!immutable_); assert(!immutable_);
@ -134,7 +135,8 @@ size_t VectorRep::ApproximateMemoryUsage() {
} }
VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count)
: bucket_(new Bucket()), : MemTableRep(arena),
bucket_(new Bucket()),
immutable_(false), immutable_(false),
sorted_(false), sorted_(false),
compare_(compare) { bucket_.get()->reserve(count); } compare_(compare) { bucket_.get()->reserve(count); }

@ -44,7 +44,9 @@ class DummyDB : public StackableDB {
return options_.env; return options_.env;
} }
virtual const Options& GetOptions() const override { using DB::GetOptions;
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
override {
return options_; return options_;
} }
@ -68,6 +70,10 @@ class DummyDB : public StackableDB {
return Status::OK(); return Status::OK();
} }
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
return nullptr;
}
class DummyLogFile : public LogFile { class DummyLogFile : public LogFile {
public: public:
/* implicit */ /* implicit */
@ -345,7 +351,7 @@ class BackupableDBTest {
options_.wal_dir = dbname_; options_.wal_dir = dbname_;
// set up backup db options // set up backup db options
CreateLoggerFromOptions(dbname_, backupdir_, env_, CreateLoggerFromOptions(dbname_, backupdir_, env_,
Options(), &logger_); DBOptions(), &logger_);
backupable_options_.reset(new BackupableDBOptions( backupable_options_.reset(new BackupableDBOptions(
backupdir_, test_backup_env_.get(), true, logger_.get(), true)); backupdir_, test_backup_env_.get(), true, logger_.get(), true));
@ -425,6 +431,19 @@ class BackupableDBTest {
} }
} }
void DeleteLogFiles() {
std::vector<std::string> delete_logs;
env_->GetChildren(dbname_, &delete_logs);
for (auto f : delete_logs) {
uint64_t number;
FileType type;
bool ok = ParseFileName(f, &number, &type);
if (ok && type == kLogFile) {
env_->DeleteFile(dbname_ + "/" + f);
}
}
}
// files // files
std::string dbname_; std::string dbname_;
std::string backupdir_; std::string backupdir_;
@ -721,10 +740,11 @@ TEST(BackupableDBTest, FailOverwritingBackups) {
// create backups 1, 2, 3, 4, 5 // create backups 1, 2, 3, 4, 5
OpenBackupableDB(true); OpenBackupableDB(true);
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
FillDB(db_.get(), 100 * i, 100 * (i + 1));
ASSERT_OK(db_->CreateNewBackup(true));
CloseBackupableDB(); CloseBackupableDB();
DeleteLogFiles();
OpenBackupableDB(false); OpenBackupableDB(false);
FillDB(db_.get(), 100 * i, 100 * (i + 1));
ASSERT_OK(db_->CreateNewBackup(true));
} }
CloseBackupableDB(); CloseBackupableDB();
@ -826,7 +846,7 @@ TEST(BackupableDBTest, RateLimiting) {
auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) /
backupable_options_->backup_rate_limit; backupable_options_->backup_rate_limit;
ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time);
ASSERT_LT(backup_time, 1.5 * rate_limited_backup_time); ASSERT_LT(backup_time, 2.5 * rate_limited_backup_time);
CloseBackupableDB(); CloseBackupableDB();
@ -838,7 +858,7 @@ TEST(BackupableDBTest, RateLimiting) {
auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) /
backupable_options_->restore_rate_limit; backupable_options_->restore_rate_limit;
ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time);
ASSERT_LT(restore_time, 1.5 * rate_limited_restore_time); ASSERT_LT(restore_time, 2.5 * rate_limited_restore_time);
AssertBackupConsistency(0, 0, 100000, 100010); AssertBackupConsistency(0, 0, 100000, 100010);
} }

@ -35,7 +35,7 @@ class GeoDBTest {
} }
}; };
const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault/"; const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault";
Options GeoDBTest::options = Options(); Options GeoDBTest::options = Options();
// Insert, Get and Remove // Insert, Get and Remove
@ -106,14 +106,14 @@ TEST(GeoDBTest, Search) {
std::vector<GeoObject> values; std::vector<GeoObject> values;
status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values); status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values);
ASSERT_TRUE(status.ok()); ASSERT_TRUE(status.ok());
ASSERT_EQ(values.size(), 1); ASSERT_EQ(values.size(), 1U);
// search all objects centered at 46 degree latitude with // search all objects centered at 46 degree latitude with
// a radius of 2 kilometers. There should be none. // a radius of 2 kilometers. There should be none.
values.clear(); values.clear();
status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values); status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values);
ASSERT_TRUE(status.ok()); ASSERT_TRUE(status.ok());
ASSERT_EQ(values.size(), 0); ASSERT_EQ(values.size(), 0U);
} }
} // namespace rocksdb } // namespace rocksdb

@ -119,15 +119,16 @@ Status DBWithTTL::StripTS(std::string* str) {
return st; return st;
} }
Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key, Status DBWithTTL::Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) { const Slice& val) {
WriteBatch batch; WriteBatch batch;
batch.Put(key, val); batch.Put(key, val);
return Write(opt, &batch); return Write(options, &batch);
} }
Status DBWithTTL::Get(const ReadOptions& options, Status DBWithTTL::Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) { std::string* value) {
Status st = db_->Get(options, key, value); Status st = db_->Get(options, key, value);
if (!st.ok()) { if (!st.ok()) {
@ -140,18 +141,18 @@ Status DBWithTTL::Get(const ReadOptions& options,
return StripTS(value); return StripTS(value);
} }
std::vector<Status> DBWithTTL::MultiGet(const ReadOptions& options, std::vector<Status> DBWithTTL::MultiGet(
const std::vector<Slice>& keys, const ReadOptions& options,
std::vector<std::string>* values) { const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values) {
return std::vector<Status>(keys.size(), return std::vector<Status>(keys.size(),
Status::NotSupported("MultiGet not\ Status::NotSupported("MultiGet not\
supported with TTL")); supported with TTL"));
} }
bool DBWithTTL::KeyMayExist(const ReadOptions& options, bool DBWithTTL::KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value, bool* value_found) {
bool* value_found) {
bool ret = db_->KeyMayExist(options, key, value, value_found); bool ret = db_->KeyMayExist(options, key, value, value_found);
if (ret && value != nullptr && value_found != nullptr && *value_found) { if (ret && value != nullptr && value_found != nullptr && *value_found) {
if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) { if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
@ -161,12 +162,12 @@ bool DBWithTTL::KeyMayExist(const ReadOptions& options,
return ret; return ret;
} }
Status DBWithTTL::Merge(const WriteOptions& opt, Status DBWithTTL::Merge(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) { const Slice& value) {
WriteBatch batch; WriteBatch batch;
batch.Merge(key, value); batch.Merge(key, value);
return Write(opt, &batch); return Write(options, &batch);
} }
Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) { Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) {
@ -208,12 +209,9 @@ Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) {
} }
} }
Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) { Iterator* DBWithTTL::NewIterator(const ReadOptions& opts,
return new TtlIterator(db_->NewIterator(opts)); ColumnFamilyHandle* column_family) {
} return new TtlIterator(db_->NewIterator(opts, column_family));
void DBWithTTL::TEST_Destroy_DBWithTtl() {
((DBImpl*) db_)->TEST_Destroy_DBImpl();
} }
} // namespace rocksdb } // namespace rocksdb

@ -23,30 +23,39 @@ class DBWithTTL : public StackableDB {
virtual ~DBWithTTL(); virtual ~DBWithTTL();
virtual Status Put(const WriteOptions& o, const Slice& key, using StackableDB::Put;
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) override; const Slice& val) override;
virtual Status Get(const ReadOptions& options, const Slice& key, using StackableDB::Get;
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) override; std::string* value) override;
using StackableDB::MultiGet;
virtual std::vector<Status> MultiGet( virtual std::vector<Status> MultiGet(
const ReadOptions& options, const std::vector<Slice>& keys, const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override; std::vector<std::string>* values) override;
using StackableDB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value,
bool* value_found = nullptr) override; bool* value_found = nullptr) override;
virtual Status Merge(const WriteOptions& options, const Slice& key, using StackableDB::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override; const Slice& value) override;
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
virtual Iterator* NewIterator(const ReadOptions& opts) override; using StackableDB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions& opts,
// Simulate a db crash, no elegant closing of database. ColumnFamilyHandle* column_family) override;
void TEST_Destroy_DBWithTtl();
virtual DB* GetBaseDB() { virtual DB* GetBaseDB() {
return db_; return db_;

Loading…
Cancel
Save