Merge pull request #1 from facebook/master

merge from facebook
Wei Wei 10 years ago
commit d1cafc0892
Files changed (first 100 shown; total lines changed in parentheses):

  1. .gitignore (17)
  2. .travis.yml (3)
  3. AUTHORS (11)
  4. HISTORY.md (79)
  5. INSTALL.md (16)
  6. Makefile (298)
  7. README.md (2)
  8. Vagrantfile (16)
  9. build_tools/build_detect_platform (73)
  10. build_tools/fbcode.clang31.sh (74)
  11. build_tools/fbcode.gcc471.sh (70)
  12. build_tools/fbcode.gcc481.sh (86)
  13. build_tools/fbcode_config.sh (125)
  14. build_tools/fbcode_config4.8.1.sh (105)
  15. build_tools/mac-install-gflags.sh (25)
  16. build_tools/make_package.sh (116)
  17. build_tools/regression_build_test.sh (34)
  18. build_tools/unity (2)
  19. build_tools/version.sh (14)
  20. coverage/coverage_test.sh (4)
  21. db/builder.cc (52)
  22. db/builder.h (11)
  23. db/c.cc (184)
  24. db/c_test.c (155)
  25. db/column_family.cc (461)
  26. db/column_family.h (238)
  27. db/column_family_test.cc (71)
  28. db/compaction.cc (135)
  29. db/compaction.h (73)
  30. db/compaction_job.cc (1095)
  31. db/compaction_job.h (127)
  32. db/compaction_job_test.cc (183)
  33. db/compaction_picker.cc (917)
  34. db/compaction_picker.h (265)
  35. db/compaction_picker_test.cc (259)
  36. db/comparator_db_test.cc (434)
  37. db/corruption_test.cc (25)
  38. db/cuckoo_table_db_test.cc (24)
  39. db/db_bench.cc (682)
  40. db/db_filesnapshot.cc (86)
  41. db/db_impl.cc (4320)
  42. db/db_impl.h (463)
  43. db/db_impl_debug.cc (54)
  44. db/db_impl_readonly.cc (119)
  45. db/db_impl_readonly.h (28)
  46. db/db_iter.cc (186)
  47. db/db_iter.h (12)
  48. db/db_iter_test.cc (1423)
  49. db/db_test.cc (4414)
  50. db/dbformat.cc (9)
  51. db/dbformat.h (24)
  52. db/deletefile_test.cc (74)
  53. db/fault_injection_test.cc (742)
  54. db/file_indexer.cc (34)
  55. db/file_indexer.h (14)
  56. db/file_indexer_test.cc (14)
  57. db/filename.cc (27)
  58. db/filename.h (16)
  59. db/flush_job.cc (221)
  60. db/flush_job.h (87)
  61. db/flush_job_test.cc (124)
  62. db/flush_scheduler.cc (63)
  63. db/flush_scheduler.h (40)
  64. db/forward_iterator.cc (225)
  65. db/forward_iterator.h (10)
  66. db/internal_stats.cc (210)
  67. db/internal_stats.h (123)
  68. db/job_context.h (103)
  69. db/listener_test.cc (401)
  70. db/log_and_apply_bench.cc (26)
  71. db/log_reader.cc (9)
  72. db/log_test.cc (6)
  73. db/memtable.cc (144)
  74. db/memtable.h (70)
  75. db/memtable_allocator.cc (52)
  76. db/memtable_allocator.h (47)
  77. db/memtable_list.cc (67)
  78. db/memtable_list.h (22)
  79. db/memtablerep_bench.cc (696)
  80. db/merge_helper.cc (21)
  81. db/merge_test.cc (79)
  82. db/perf_context_test.cc (275)
  83. db/plain_table_db_test.cc (64)
  84. db/prefix_test.cc (11)
  85. db/repair.cc (72)
  86. db/simple_table_db_test.cc (810)
  87. db/skiplist.h (46)
  88. db/skiplist_test.cc (38)
  89. db/snapshot.h (23)
  90. db/table_cache.cc (55)
  91. db/table_cache.h (18)
  92. db/table_properties_collector.cc (3)
  93. db/table_properties_collector_test.cc (14)
  94. db/transaction_log_impl.cc (47)
  95. db/transaction_log_impl.h (33)
  96. db/version_builder.cc (330)
  97. db/version_builder.h (42)
  98. db/version_builder_test.cc (229)
  99. db/version_edit.cc (16)
  100. db/version_edit.h (45)
Some files were not shown because too many files have changed in this diff.

.gitignore (17 lines changed)

@@ -1,5 +1,5 @@
TARGETS
build_config.mk
make_config.mk
*.a
*.arc
@@ -20,6 +20,7 @@ build_config.mk
*.d-e
*.o-*
*.swp
*~
ldb
manifest_dump
@@ -28,8 +29,22 @@ util/build_version.cc
build_tools/VALGRIND_LOGS/
coverage/COVERAGE_REPORT
.gdbhistory
package/
.phutil_module_cache
unity
tags
java/out
java/target
java/test-libs
java/*.log
java/include/org_rocksdb_*.h
.idea/
*.iml
unity.cc
java/crossbuild/.vagrant
.vagrant/
java/**.asc
java/javadoc

.travis.yml (3 lines changed)

@@ -14,7 +14,6 @@ before_install:
- sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
# Lousy hack to disable use and testing of fallocate, which doesn't behave quite
# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
- sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform
script: make check -j8
script: OPT=-DTRAVIS make unity && make clean && OPT=-DTRAVIS make check
notifications:
email: false

AUTHORS (11 lines changed)

@@ -0,0 +1,11 @@
Facebook Inc.
Facebook Engineering Team
Google Inc.
# Initial version authors:
Jeffrey Dean <jeff@google.com>
Sanjay Ghemawat <sanjay@google.com>
# Partial list of contributors:
Kevin Regan <kevin.d.regan@gmail.com>
Johan Bilien <jobi@litl.com>

HISTORY.md (79 lines changed)

@@ -1,6 +1,78 @@
# Rocksdb Change Log
### Unreleased
### Unreleased Features
* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted
* By default we now optimize the compilation for the compilation platform (using -march=native). If you want to build a portable binary, use 'PORTABLE=1' before the make command.
* We now allow level-compaction to place files in different paths by
specifying them in db_paths along with the target_size.
Lower numbered levels will be placed earlier in the db_paths and higher
numbered levels will be placed later in the db_paths vector.
* Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000)
* Added BlockBasedTableOptions.format_version option, which allows the user to specify which version of block based table they want. As a general guideline, newer versions have more features, but might not be readable by older versions of RocksDB (see the sketch after this list).
* Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions.
* GetThreadStatus() is now able to report compaction activity.
* MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv().
* Add SliceTransform.SameResultWhenAppended() to help users determine whether it is safe to apply prefix bloom/hash.
* Block based table now makes use of prefix bloom filter if it is a full filter.
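
A minimal sketch of how the format_version and db_paths options above fit together, based on the public headers of this era; the paths and target sizes are made-up examples, not recommendations:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;

      // Opt into the new block based table format (version 2).
      rocksdb::BlockBasedTableOptions table_options;
      table_options.format_version = 2;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));

      // Level compaction fills the first path up to its target_size and
      // places higher-numbered levels in the later paths.
      options.db_paths.emplace_back("/fast_ssd", 100ull << 30);  // ~100 GB
      options.db_paths.emplace_back("/big_hdd", 1000ull << 30);  // ~1 TB

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/fast_ssd/db", &db);
      delete db;
      return s.ok() ? 0 : 1;
    }
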
### Public API changes
* Deprecated skip_log_error_on_recovery option
* Logger method logv with log level parameter is now virtual
## 3.9.0 (12/8/2014)
### New Features
* Add rocksdb::GetThreadList(), which in the future will return the current status of all
rocksdb-related threads. We will add more instrumentation in the following RocksDB
releases.
* Change convert function in rocksdb/utilities/convenience.h to return Status instead of boolean.
Also add support for nested options in convert function
### Public API changes
* New API to create a checkpoint added. Given a directory name, creates a new
database which is an image of the existing database (see the sketch after this section).
* New API LinkFile added to Env. If you implement your own Env class, an
implementation of the API LinkFile will have to be provided.
* MemTableRep takes MemTableAllocator instead of Arena
### Improvements
* RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag.
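
A short sketch of the checkpoint API described above; TakeCheckpoint and its directory argument are hypothetical:

    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/utilities/checkpoint.h"

    // Creates a new database under checkpoint_dir that is an image of *db.
    rocksdb::Status TakeCheckpoint(rocksdb::DB* db,
                                   const std::string& checkpoint_dir) {
      rocksdb::Checkpoint* checkpoint = nullptr;
      rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
      if (s.ok()) {
        s = checkpoint->CreateCheckpoint(checkpoint_dir);
      }
      delete checkpoint;
      return s;
    }
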
## 3.8.0 (11/14/2014)
### Public API changes
* BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on.
* BackupableDB/RestoreBackupableDB have new GarbageCollect() methods, which will clean up files from corrupt and obsolete backups.
* BackupableDB/RestoreBackupableDB have new GetCorruptedBackups() methods which list corrupt backups.
### Cleanup
* Bunch of code cleanup, some extra warnings turned on (-Wshadow, -Wshorten-64-to-32, -Wnon-virtual-dtor)
### New features
* CompactFiles and EventListener, although they are still in experimental state
* Full ColumnFamily support in RocksJava.
## 3.7.0 (11/6/2014)
### Public API changes
* Introduce SetOptions() API to allow adjusting a subset of options dynamically online (see the sketch after this section)
* Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString()
* Remove WriteBatchWithIndex.Delete() overloads using SliceParts
* When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, if options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it.
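
A hedged sketch of the two mechanisms above. The option names in the strings are examples of dynamically adjustable options, the default-column-family overload of SetOptions is assumed, and, per the 3.9.0 note earlier, the string-to-options converters now return Status (they returned a boolean when introduced here):

    #include <string>
    #include <unordered_map>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/utilities/convenience.h"

    void Tune(rocksdb::DB* db) {
      // Adjust a mutable option online, without reopening the DB.
      db->SetOptions({{"write_buffer_size", "131072"}});

      // Build ColumnFamilyOptions from a string of option assignments;
      // check s.ok() before using 'result'.
      rocksdb::ColumnFamilyOptions base, result;
      rocksdb::Status s = rocksdb::GetColumnFamilyOptionsFromString(
          base, "write_buffer_size=131072;max_write_buffer_number=4", &result);
    }
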
## 3.6.0 (10/7/2014)
### Disk format changes
* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to take when you convert to the new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy
### Behavior changes
* We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6.
* When disableDataSync=true, we no longer sync the MANIFEST file.
* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly.
### Public API changes
* Change target_file_size_base type to uint64_t from int.
* Remove allow_thread_local. This feature has proved to be stable, so we are turning it always-on.
## 3.5.0 (9/3/2014)
### New Features
* Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it.
* Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include:
@@ -11,15 +83,12 @@
### Public API changes
* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
----- Past Releases -----
## 3.4.0 (8/18/2014)
### New Features
* Support Multiple DB paths in universal style compactions
* Add feature of storing plain table index and bloom filter in SST file.
* CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0.
* Added iterate_upper_bound to define the extent up to which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out of the iteration anyway. This may improve performance in case there are a large number of delete markers or overwritten entries (see the sketch after this section).
### Public API changes
* DBOptions.db_paths now is a vector of a DBPath structure which indicates both the path and the target size
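
A sketch of iterate_upper_bound from the 3.4.0 notes above; ScanRange is a hypothetical helper:

    #include <memory>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/slice.h"

    // Scans keys in [start, end) without touching anything at or past
    // 'end', e.g. a long run of delete markers right after the range.
    void ScanRange(rocksdb::DB* db, const rocksdb::Slice& start,
                   const rocksdb::Slice& end) {
      rocksdb::ReadOptions read_options;
      read_options.iterate_upper_bound = &end;  // must outlive the iterator
      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
      for (it->Seek(start); it->Valid(); it->Next()) {
        // use it->key() / it->value()
      }
    }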

INSTALL.md (16 lines changed)

@@ -2,7 +2,7 @@
RocksDB's library should be able to compile without any dependency installed,
although we recommend installing some compression libraries (see below).
We do depend on newer gcc with C++11 support.
We do depend on newer gcc/clang with C++11 support.
There are a few options when compiling RocksDB:
@@ -15,6 +15,10 @@ There are a few options when compiling RocksDB:
* `make all` will compile our static library, and all our tools and unit tests. Our tools
depend on gflags. You will need to have gflags installed to run `make all`.
* By default the binary we produce is optimized for the platform you're compiling on
(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before
your make commands, like this: `PORTABLE=1 make static_lib`
## Dependencies
* You can link RocksDB with following compression libraries:
@@ -72,13 +76,7 @@ depend on gflags. You will need to have gflags installed to run `make all`.
* Install via [homebrew](http://brew.sh/).
* If you're a first-time developer on MacOS, you still need to run: `xcode-select --install` in your command line.
* run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
* Install zlib, bzip2 and snappy libraries for compression.
* Install gflags. We have included a script
`build_tools/mac-install-gflags.sh`, which should automatically install it (execute this file instead of running it with the "source" command).
If you installed gflags by other means (for example, `brew install gflags`),
please set `LIBRARY_PATH` and `CPATH` accordingly.
* Please note that some of the optimizations/features are disabled in OSX.
We did not run any production workloads on it.
* run `brew install rocksdb`
* **iOS**:
* Run: `TARGET_OS=IOS make static_lib`
* Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.

Makefile (298 lines changed)

@@ -3,12 +3,19 @@
# found in the LICENSE file. See the AUTHORS file for names of contributors.
# Inherit some settings from environment variables, if available
INSTALL_PATH ?= $(CURDIR)
#-----------------------------------------------
CFLAGS += ${EXTRA_CFLAGS}
CXXFLAGS += ${EXTRA_CXXFLAGS}
LDFLAGS += $(EXTRA_LDFLAGS)
MACHINE ?= $(shell uname -m)
ifneq ($(MAKECMDGOALS),dbg)
OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
OPT += -O2 -fno-omit-frame-pointer
ifneq ($(MACHINE),ppc64) # ppc64 doesn't support -momit-leaf-frame-pointer
OPT += -momit-leaf-frame-pointer
endif
else
# intentionally left blank
endif
@@ -24,9 +31,9 @@ endif
#-----------------------------------------------
# detect what platform we're building on
$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/build_config.mk"))
$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
# this file is generated by the previous line to set build flags and sources
include build_config.mk
include make_config.mk
ifneq ($(PLATFORM), IOS)
CFLAGS += -g
@@ -36,35 +43,71 @@ else
OPT += -DNDEBUG
endif
ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
# found
CFLAGS += -fno-exceptions
CXXFLAGS += -fno-exceptions
endif
# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
ifdef COMPILE_WITH_ASAN
# ASAN compile flags
DISABLE_JEMALLOC=1
EXEC_LDFLAGS += -fsanitize=address
PLATFORM_CCFLAGS += -fsanitize=address
PLATFORM_CXXFLAGS += -fsanitize=address
else
# if we're not compiling with ASAN, use jemalloc
endif
# TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc.
ifdef COMPILE_WITH_TSAN
DISABLE_JEMALLOC=1
EXEC_LDFLAGS += -fsanitize=thread -pie
PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN
PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN
endif
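# Link jemalloc (and define HAVE_JEMALLOC) unless one of the sanitizer
# builds above set DISABLE_JEMALLOC=1.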
ifndef DISABLE_JEMALLOC
EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
endif
WARNING_FLAGS = -Wall -Werror -Wsign-compare
#-------------------------------------------------
# make install related stuff
INSTALL_PATH ?= /usr/local
uninstall:
@rm -rf $(INSTALL_PATH)/include/rocksdb
@rm -rf $(INSTALL_PATH)/lib/$(LIBRARY)
@rm -rf $(INSTALL_PATH)/lib/$(SHARED)
install:
@install -d $(INSTALL_PATH)/lib
@for header_dir in `find "include/rocksdb" -type d`; do \
install -d $(INSTALL_PATH)/$$header_dir; \
done
@for header in `find "include/rocksdb" -type f -name *.h`; do \
install -C -m 644 $$header $(INSTALL_PATH)/$$header; \
done
@[ ! -e $(LIBRARY) ] || install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib
@[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib
#-------------------------------------------------
WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow
CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor
LDFLAGS += $(PLATFORM_LDFLAGS)
LIBOBJECTS = $(SOURCES:.cc=.o)
LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
MOCKOBJECTS = $(MOCK_SOURCES:.cc=.o)
TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS)
BENCHHARNESS = ./util/benchharness.o
VALGRIND_ERROR = 2
VALGRIND_DIR = build_tools/VALGRIND_LOGS
VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
TESTS = \
@@ -85,22 +128,26 @@ TESTS = \
coding_test \
corruption_test \
crc32c_test \
slice_transform_test \
dbformat_test \
env_test \
blob_store_test \
fault_injection_test \
filelock_test \
filename_test \
filter_block_test \
block_based_filter_block_test \
full_filter_block_test \
histogram_test \
log_test \
manual_compaction_test \
memenv_test \
mock_env_test \
merge_test \
merger_test \
redis_test \
reduce_levels_test \
plain_table_db_test \
comparator_db_test \
prefix_test \
simple_table_db_test \
skiplist_test \
stringappend_test \
ttl_test \
@@ -110,8 +157,11 @@ TESTS = \
spatial_db_test \
version_edit_test \
version_set_test \
compaction_picker_test \
version_builder_test \
file_indexer_test \
write_batch_test\
write_batch_test \
write_controller_test\
deletefile_test \
table_test \
thread_local_test \
@@ -121,7 +171,14 @@ TESTS = \
cuckoo_table_builder_test \
cuckoo_table_reader_test \
cuckoo_table_db_test \
write_batch_with_index_test
flush_job_test \
wal_manager_test \
listener_test \
compaction_job_test \
thread_list_test \
sst_dump_test
SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/)
TOOLS = \
sst_dump \
@@ -129,10 +186,9 @@ TOOLS = \
db_stress \
ldb \
db_repl_stress \
options_test \
blob_store_bench
options_test \
PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test memtablerep_bench $(TOOLS)
# The library name is configurable since we are maintaining libraries of both
# debug/release mode.
@@ -140,7 +196,10 @@ ifeq ($(LIBNAME),)
LIBNAME=librocksdb
endif
LIBRARY = ${LIBNAME}.a
MEMENVLIBRARY = libmemenv.a
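# Pull the version components out of include/rocksdb/version.h, which
# defines them as lines of the form "#define ROCKSDB_MAJOR 3".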
ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
default: all
@@ -153,29 +212,33 @@ ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1)
SHARED3 = $(SHARED1)
SHARED4 = $(SHARED1)
SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_MAJOR = 3
SHARED_MINOR = 4
SHARED_MAJOR = $(ROCKSDB_MAJOR)
SHARED_MINOR = $(ROCKSDB_MINOR)
SHARED_PATCH = $(ROCKSDB_PATCH)
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
$(SHARED1): $(SHARED3)
ln -fs $(SHARED3) $(SHARED1)
$(SHARED2): $(SHARED3)
ln -fs $(SHARED3) $(SHARED2)
SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH)
SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4)
$(SHARED1): $(SHARED4)
ln -fs $(SHARED4) $(SHARED1)
$(SHARED2): $(SHARED4)
ln -fs $(SHARED4) $(SHARED2)
$(SHARED3): $(SHARED4)
ln -fs $(SHARED4) $(SHARED3)
endif
$(SHARED3):
$(SHARED4):
$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@
endif # PLATFORM_SHARED_EXT
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \
release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
dbg
dbg rocksdbjavastatic rocksdbjava install uninstall analyze
all: $(LIBRARY) $(PROGRAMS) $(TESTS)
@@ -201,6 +264,10 @@ check: $(TESTS) ldb
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
python tools/ldb_test.py
check_some: $(SUBSET) ldb
for t in $(SUBSET); do echo "***** Running $$t"; ./$$t || exit 1; done
python tools/ldb_test.py
ldb_tests: ldb
python tools/ldb_test.py
@@ -236,6 +303,10 @@ valgrind_check: all $(PROGRAMS) $(TESTS)
echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
done
analyze:
$(MAKE) clean
$(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) -o $(CURDIR)/scan_build_report $(MAKE) all -j32
unity.cc:
$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/unity" "$(CURDIR)/unity.cc"))
@@ -243,10 +314,11 @@ unity: unity.cc unity.o
$(CXX) unity.o $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
clean:
-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk unity.cc
-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) make_config.mk unity.cc
-rm -rf ios-x86/* ios-arm/*
-find . -name "*.[od]" -exec rm {} \;
-find . -name "*.[oda]" -exec rm {} \;
-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
-rm -rf bzip2* snappy* zlib*
tags:
ctags * -R
cscope -b `find . -name '*.cc'` `find . -name '*.h'`
@@ -254,6 +326,9 @@ tags:
format:
build_tools/format-diff.sh
package:
bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR)
# ---------------------------------------------------------------------------
# Unit tests and tools
# ---------------------------------------------------------------------------
@@ -264,6 +339,12 @@ $(LIBRARY): $(LIBOBJECTS)
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
memtablerep_bench: db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@@ -276,9 +357,6 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
signal_test: util/signal_test.o $(LIBOBJECTS)
$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@@ -309,9 +387,6 @@ cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@@ -333,6 +408,10 @@ corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@@ -345,8 +424,8 @@ log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
@@ -378,20 +457,35 @@ ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@@ -411,6 +505,12 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@@ -420,9 +520,15 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
@@ -438,15 +544,26 @@ cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTH
cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS)
listener_test: db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
compactor_test: utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
options_test: util/options_test.o util/options_helper.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(MEMENVOBJECTS)
sst_dump_test: util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
memenv_test : util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
mock_env_test : util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@@ -475,24 +592,83 @@ ldb: tools/ldb.o $(LIBOBJECTS)
JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc
JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
ROCKSDBJNILIB = librocksdbjni.so
ROCKSDB_JAR = rocksdbjni.jar
ARCH := $(shell getconf LONG_BIT)
ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so
ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar
ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar
ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar
ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar
ifeq ($(PLATFORM), OS_MACOSX)
ROCKSDBJNILIB = librocksdbjni.jnilib
JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
ROCKSDBJNILIB = librocksdbjni-osx.jnilib
ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar
ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","")
JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin
else
JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
endif
endif
libz.a:
-rm -rf zlib-1.2.8
curl -O http://zlib.net/zlib-1.2.8.tar.gz
tar xvzf zlib-1.2.8.tar.gz
cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make
cp zlib-1.2.8/libz.a .
libbz2.a:
-rm -rf bzip2-1.0.6
curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz
tar xvzf bzip2-1.0.6.tar.gz
cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64'
cp bzip2-1.0.6/libbz2.a .
libsnappy.a:
-rm -rf snappy-1.1.1
curl -O https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
tar xvzf snappy-1.1.1.tar.gz
cd snappy-1.1.1 && ./configure --with-pic --enable-static
cd snappy-1.1.1 && make
cp snappy-1.1.1/.libs/libsnappy.a .
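# Bake the static compression libraries built above into the JNI library,
# so the resulting jar does not depend on system compression libraries.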
rocksdbjavastatic: libz.a libbz2.a libsnappy.a
OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j
cd java;$(MAKE) javalib;
rm -f ./java/target/$(ROCKSDBJNILIB)
$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a
cd java/target;strip -S -x $(ROCKSDBJNILIB)
cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
cd java/target/apidocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) *
cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
rocksdbjavastaticrelease: rocksdbjavastatic
cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64
cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
rocksdbjavastaticpublish: rocksdbjavastaticrelease
mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc
mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources
mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64
mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32
mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx
mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar
rocksdbjava:
OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32
cd java;$(MAKE) java;
rm -f ./java/$(ROCKSDBJNILIB)
$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB)
cd java;$(MAKE) javalib;
rm -f ./java/target/$(ROCKSDBJNILIB)
$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
jclean:
cd java;$(MAKE) clean;
rm -f $(ROCKSDBJNILIB)
jtest:
cd java;$(MAKE) sample;$(MAKE) test;
@@ -565,8 +741,12 @@ ifneq ($(MAKECMDGOALS),clean)
ifneq ($(MAKECMDGOALS),format)
ifneq ($(MAKECMDGOALS),jclean)
ifneq ($(MAKECMDGOALS),jtest)
ifneq ($(MAKECMDGOALS),package)
ifneq ($(MAKECMDGOALS),analyze)
-include $(DEPFILES)
endif
endif
endif
endif
endif
endif

README.md (2 lines changed)

@@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb)
RocksDB is developed and maintained by Facebook Database Engineering Team.
It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
and Jeff Dean (jeff@google.com)
This code is a library that forms the core building block for a fast

Vagrantfile (16 lines changed)

@@ -0,0 +1,16 @@
Vagrant.configure("2") do |config|
config.vm.provider "virtualbox" do |v|
v.memory = 4096
v.cpus = 2
end
config.vm.define "ubuntu14" do |box|
box.vm.box = "ubuntu/trusty64"
end
config.vm.define "centos65" do |box|
box.vm.box = "chef/centos-6.5"
end
end

build_tools/build_detect_platform (73 lines changed)

@@ -32,7 +32,7 @@
# 2. Once install, add the include path/lib path for gflags to CPATH and
# LIBRARY_PATH respectively. If installed with default mode, the
# lib and include path will be /usr/local/lib and /usr/local/include
# Mac user can do this by running build_tools/mac-install-gflags.sh
# Mac user can do this by having brew installed and running brew install gflags
OUTPUT=$1
if test -z "$OUTPUT"; then
@@ -46,18 +46,15 @@ PLATFORM_CXXFLAGS="-std=c++11"
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
# Default to fbcode gcc on internal fb machines
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
FBCODE_BUILD="true"
if [ -z "$USE_CLANG" ]; then
CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
$(rpm -q --whatprovides redhat-release)`
if [ "$CENTOS_VERSION" = "6" ]; then
source "$PWD/build_tools/fbcode.gcc481.sh"
else
source "$PWD/build_tools/fbcode.gcc471.sh"
fi
# If we're compiling with TSAN we need pic build
PIC_BUILD=$COMPILE_WITH_TSAN
if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
source "$PWD/build_tools/fbcode_config.sh"
else
source "$PWD/build_tools/fbcode.clang31.sh"
# we need this to build with MySQL. Don't use for other purposes.
source "$PWD/build_tools/fbcode_config4.8.1.sh"
fi
fi
@@ -78,6 +75,14 @@ if test -z "$TARGET_OS"; then
TARGET_OS=`uname -s`
fi
if test -z "$CLANG_SCAN_BUILD"; then
CLANG_SCAN_BUILD=scan-build
fi
if test -z "$CLANG_ANALYZER"; then
CLANG_ANALYZER=$(which clang++)
fi
COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
CROSS_COMPILE=
PLATFORM_CCFLAGS=
@@ -164,7 +169,7 @@ if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then
"$PWD/build_tools/build_detect_version"
fi
# We want to make a list of all cc files within util, db, table, and helpers
# We want to make a list of all cc files within util, db and table
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.
@@ -173,36 +178,27 @@ DIRS="util db table utilities"
set -f # temporarily disable globbing so that our patterns aren't expanded
PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *bench*.cc -prune"
PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
PRUNE_MOCK="-name *mock*.cc -prune"
PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_MOCK -o -name '*.cc' -print | sort | tr "\n" " "`
MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock*.cc' -print | grep -v "test" | sort | tr "\n" " "`
set +f # re-enable globbing
# The sources consist of the portable files, plus the platform-specific port
# file.
echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT"
echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT"
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT"
echo "MOCK_SOURCES=$MOCK_SOURCES" >> "$OUTPUT"
if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
# Also don't need any compilation tests if compiling on fbcode
true
else
# If -std=c++0x works, use <atomic>. Otherwise use port_posix.h.
$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <atomic>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
fi
# Test whether fallocate is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <fcntl.h>
int main() {
int fd = open("/dev/null", 0);
fallocate(fd, 0, 0, 1024);
fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024);
}
EOF
if [ "$?" = 0 ]; then
@@ -302,6 +298,14 @@ EOF
fi
fi
# Test whether -Wshorten-64-to-32 is available
$CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -Wshorten-64-to-32"
fi
# shall we use HDFS?
if test "$USE_HDFS"; then
@@ -318,14 +322,22 @@ if test "$USE_HDFS"; then
JAVA_LDFLAGS="$JAVA_LDFLAGS $HDFS_LDFLAGS"
fi
# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
if test "$USE_SSE"; then
# if Intel SSE instruction set is supported, set USE_SSE=1
COMMON_FLAGS="$COMMON_FLAGS -msse -msse4.2 "
elif test -z "$PORTABLE"; then
COMMON_FLAGS="$COMMON_FLAGS -march=native "
fi
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
VALGRIND_VER="$VALGRIND_VER"
ROCKSDB_MAJOR=`build_tools/version.sh major`
ROCKSDB_MINOR=`build_tools/version.sh minor`
ROCKSDB_PATCH=`build_tools/version.sh patch`
echo "CC=$CC" >> "$OUTPUT"
echo "CXX=$CXX" >> "$OUTPUT"
echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
@@ -341,3 +353,8 @@ echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"
echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT"
echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT"
echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT"
echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT"
echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT"

build_tools/fbcode.clang31.sh (deleted; 74 lines)

@@ -1,74 +0,0 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
# location of libgcc
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
# location of glibc
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CXXFLAGS="$CFLAGS -nostdinc++"
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED

build_tools/fbcode.gcc471.sh (deleted; 70 lines)

@@ -1,70 +0,0 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
# location of libhdfs libraries
if test "$USE_HDFS"; then
JAVA_HOME="/usr/local/jdk-6u22-64"
JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
HDFSLIB+=" -ldl -lverify -ljava -ljvm "
fi
# location of libgcc
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
# location of glibc
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2"
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER

build_tools/fbcode.gcc481.sh (deleted; 86 lines)

@@ -1,86 +0,0 @@
#!/bin/sh
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
if [ "$CENTOS_VERSION" = "6" ]; then
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
else
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
fi
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
# location of libhdfs libraries
if test "$USE_HDFS"; then
JAVA_HOME="/usr/local/jdk-6u22-64"
JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
HDFSLIB+=" -ldl -lverify -ljava -ljvm "
fi
# location of libgcc
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
# location of glibc
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
# location of numa
NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE

build_tools/fbcode_config.sh (new file; 125 lines)

@@ -0,0 +1,125 @@
#!/bin/sh
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ and clang compilers and also
# uses jemalloc
# Environment variables that change the behavior of this script:
# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. Libraries that don't have a pic variant will not be included
CFLAGS=""
# location of libgcc
LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e"
LIBGCC_INCLUDE="$LIBGCC_BASE/include"
LIBGCC_LIBS=" -L $LIBGCC_BASE/libs"
# location of glibc
GLIBC_REV=7397bed99280af5d9543439cdb7d018af7542720
GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/include"
GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/"
if test -z $PIC_BUILD; then
SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a"
else
SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy_pic.a"
fi
CFLAGS+=" -DSNAPPY"
if test -z $PIC_BUILD; then
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/"
ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a"
CFLAGS+=" -DZLIB"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/"
BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a"
CFLAGS+=" -DBZIP2"
LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/"
LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a"
CFLAGS+=" -DLZ4"
fi
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/"
if test -z "$PIC_BUILD"; then
GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a"
else
GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags_pic.a"
fi
CFLAGS+=" -DGFLAGS=google"
# location of jemalloc
JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/"
JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a"
if test -z "$PIC_BUILD"; then
# location of numa
NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/"
NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a"
CFLAGS+=" -DNUMA"
# location of libunwind
LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a"
fi
# use Intel SSE support for checksum calculations
export USE_SSE=1
BINUTILS="/mnt/gvfs/third-party2/binutils/0b6ad0c88ddd903333a48ae8bff134efac468e4a/2.25/centos6-native/da39a3e/bin"
AR="$BINUTILS/ar"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4"
STDLIBS="-L $GCC_BASE/lib64"
CLANG_BASE="/mnt/gvfs/third-party2/clang/290704c112bf894bf4a30d7bbd1be81e34998473/dev"
CLANG_ANALYZER="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++"
CLANG_SCAN_BUILD="$CLANG_BASE/src/clang/tools/scan-build/scan-build"
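# USE_CLANG: set to any non-empty value to pick the clang toolchain in the
# branch below instead of gcc.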
if [ -z "$USE_CLANG" ]; then
# gcc
CC="$GCC_BASE/bin/gcc"
CXX="$GCC_BASE/bin/g++"
CFLAGS+=" -B$BINUTILS/gold"
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
else
# clang
CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/"
CC="$CLANG_BASE/centos6-native/af4b1a0/bin/clang"
CXX="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++"
KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include"
CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x "
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x/x86_64-facebook-linux "
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $CLANG_INCLUDE"
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CXXFLAGS="-nostdinc++"
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.9-glibc-2.20/lib/ld.so"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/"
export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD

@ -0,0 +1,105 @@
#!/bin/sh
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc.
# location of libgcc
LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc"
LIBGCC_INCLUDE="$LIBGCC_BASE/include"
LIBGCC_LIBS=" -L $LIBGCC_BASE/libs"
# location of glibc
GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa
GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include"
GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include"
SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include"
ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/"
BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a"
LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/"
GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a"
# location of jemalloc
JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/include"
JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a"
# location of numa
NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
# location of libunwind
LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc
LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a"
# use Intel SSE support for checksum calculations
export USE_SSE=1
BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin"
AR="$BINUTILS/ar"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc"
STDLIBS="-L $GCC_BASE/lib64"
if [ -z "$USE_CLANG" ]; then
# gcc
CC="$GCC_BASE/bin/gcc"
CXX="$GCC_BASE/bin/g++"
CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic"
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
else
# clang
CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4"
CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include"
CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang"
CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++"
KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/"
CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib"
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 "
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux "
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $CLANG_INCLUDE"
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CXXFLAGS="-nostdinc++"
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0
VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/"
export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE

@ -1,25 +0,0 @@
#!/bin/sh
# Install gflags for mac developers.
set -e
DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
cd $DIR
wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
tar xvfz gflags-2.0.tar.gz
cd gflags-2.0
./configure
make
make install
# Add include/lib path for g++
echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
echo ""
echo "-----------------------------------------------------------------------------"
echo "| Installation Completed |"
echo "-----------------------------------------------------------------------------"
echo "Please run \`. ~/.bash_profile\` to be able to compile with gflags"

@ -0,0 +1,116 @@
#!/usr/bin/env bash
set -e
function log() {
echo "[+] $1"
}
function fatal() {
echo "[!] $1"
exit 1
}
function platform() {
local __resultvar=$1
if [[ -f "/etc/yum.conf" ]]; then
eval $__resultvar="centos"
elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then
eval $__resultvar="ubuntu"
else
fatal "Unknwon operating system"
fi
}
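# Example: `platform OS` (invoked just below) stores "centos" or "ubuntu"
# into the variable named OS via the eval-based indirect assignment above.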
platform OS
function package() {
if [[ $OS = "ubuntu" ]]; then
if dpkg --get-selections | grep --quiet $1; then
log "$1 is already installed. skipping."
else
apt-get install "$@" -y
fi
elif [[ $OS = "centos" ]]; then
if rpm -qa | grep --quiet $1; then
log "$1 is already installed. skipping."
else
yum install "$@" -y
fi
fi
}
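# Example: `package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6`
# installs the package only if it is missing; extra arguments are forwarded
# verbatim to apt-get/yum.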
function detect_fpm_output() {
if [[ $OS = "ubuntu" ]]; then
export FPM_OUTPUT=deb
elif [[ $OS = "centos" ]]; then
export FPM_OUTPUT=rpm
fi
}
detect_fpm_output
function gem_install() {
if gem list | grep --quiet $1; then
log "$1 is already installed. skipping."
else
gem install "$@"
fi
}
function main() {
if [[ $# -ne 1 ]]; then
fatal "Usage: $0 <rocksdb_version>"
else
log "using rocksdb version: $1"
fi
if [[ -d /vagrant ]]; then
if [[ $OS = "ubuntu" ]]; then
package g++-4.7
export CXX=g++-4.7
# the resulting deb would depend on libgflags2, but the static lib is the
# only thing installed by `make install`
package libgflags-dev
package ruby-all-dev
elif [[ $OS = "centos" ]]; then
pushd /etc/yum.repos.d
if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then
wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo
fi
package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6
package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6
export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc
export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp
export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++
export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin
popd
if ! rpm -qa | grep --quiet gflags; then
rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm
fi
package ruby
package ruby-devel
package rubygems
package rpm-build
fi
fi
gem_install fpm
make static_lib
make install INSTALL_PATH=package
fpm \
-s dir \
-t $FPM_OUTPUT \
-n rocksdb \
-v $1 \
--prefix /usr \
--url http://rocksdb.org/ \
-m rocksdb@fb.com \
--license BSD \
--vendor Facebook \
--description "RocksDB is an embeddable persistent key-value store for fast storage." \
package
}
main "$@"

@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \
--threads=32 \
--writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram
# measure fillseq with bunch of column families
./db_bench \
--benchmarks=fillseq \
--num_column_families=500 \
--write_buffer_size=1048576 \
--db=$DATA_DIR \
--use_existing_db=0 \
--num=$NUM \
--writes=$NUM \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 > ${STAT_FILE}.fillseq_lots_column_families
# measure overwrite performance with bunch of column families
./db_bench \
--benchmarks=overwrite \
--num_column_families=500 \
--write_buffer_size=1048576 \
--db=$DATA_DIR \
--use_existing_db=1 \
--num=$NUM \
--writes=$((NUM / 10)) \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=8 > ${STAT_FILE}.overwrite_lots_column_families
# send data to ods
function send_to_ods {
@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr
send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram
send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram
send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families
send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families

@ -54,7 +54,7 @@ case "$TARGET_OS" in
exit 1
esac
# We want to make a list of all cc files within util, db, table, and helpers
# We want to make a list of all cc files within util, db and table
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.

@ -0,0 +1,14 @@
#!/bin/sh
if [ "$#" = "0" ]; then
echo "Usage: $0 major|minor|patch"
exit 1
fi
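# Example: `build_tools/version.sh major` prints the MAJOR value parsed from
# include/rocksdb/version.h.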
if [ "$1" = "major" ]; then
cat include/rocksdb/version.h | grep MAJOR | head -n1 | awk '{print $3}'
fi
if [ "$1" = "minor" ]; then
cat include/rocksdb/version.h | grep MINOR | head -n1 | awk '{print $3}'
fi
if [ "$1" = "patch" ]; then
cat include/rocksdb/version.h | grep PATCH | head -n1 | awk '{print $3}'
fi

@ -11,8 +11,8 @@ fi
ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
source $ROOT/build_tools/fbcode.gcc471.sh
GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
source $ROOT/build_tools/fbcode_config.sh
GCOV=$GCC_BASE/bin/gcov
else
GCOV=$(which gcov)
fi

@ -26,21 +26,24 @@ namespace rocksdb {
class TableFactory;
TableBuilder* NewTableBuilder(const Options& options,
TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions,
const InternalKeyComparator& internal_comparator,
WritableFile* file,
CompressionType compression_type) {
return options.table_factory->NewTableBuilder(options, internal_comparator,
file, compression_type);
const CompressionType compression_type,
const CompressionOptions& compression_opts) {
return ioptions.table_factory->NewTableBuilder(
ioptions, internal_comparator, file, compression_type, compression_opts);
}
Status BuildTable(const std::string& dbname, Env* env, const Options& options,
const EnvOptions& soptions, TableCache* table_cache,
Status BuildTable(const std::string& dbname, Env* env,
const ImmutableCFOptions& ioptions,
const EnvOptions& env_options, TableCache* table_cache,
Iterator* iter, FileMetaData* meta,
const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression,
const CompressionOptions& compression_opts,
const Env::IOPriority io_priority) {
Status s;
meta->fd.file_size = 0;
@ -50,33 +53,36 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
// If the sequence number of the smallest entry in the memtable is
// smaller than the most recent snapshot, then we do not trigger
// removal of duplicate/deleted keys as part of this builder.
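// (For example, if a live snapshot sits at sequence number 100 and the
// memtable's earliest entry has sequence number 90, purging stays disabled
// so the snapshot can still observe the overwritten and deleted keys.)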
bool purge = options.purge_redundant_kvs_while_flush;
bool purge = ioptions.purge_redundant_kvs_while_flush;
if (earliest_seqno_in_memtable <= newest_snapshot) {
purge = false;
}
std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(),
std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(),
meta->fd.GetPathId());
if (iter->Valid()) {
unique_ptr<WritableFile> file;
s = env->NewWritableFile(fname, &file, soptions);
s = env->NewWritableFile(fname, &file, env_options);
if (!s.ok()) {
return s;
}
file->SetIOPriority(io_priority);
TableBuilder* builder =
NewTableBuilder(options, internal_comparator, file.get(), compression);
TableBuilder* builder = NewTableBuilder(
ioptions, internal_comparator, file.get(),
compression, compression_opts);
// the first key is the smallest key
Slice key = iter->key();
meta->smallest.DecodeFrom(key);
meta->smallest_seqno = GetInternalKeySeqno(key);
meta->largest_seqno = meta->smallest_seqno;
{
// the first key is the smallest key
Slice key = iter->key();
meta->smallest.DecodeFrom(key);
meta->smallest_seqno = GetInternalKeySeqno(key);
meta->largest_seqno = meta->smallest_seqno;
}
MergeHelper merge(internal_comparator.user_comparator(),
options.merge_operator.get(), options.info_log.get(),
options.min_partial_merge_operands,
ioptions.merge_operator, ioptions.info_log,
ioptions.min_partial_merge_operands,
true /* internal key corruption is not ok */);
if (purge) {
@ -196,12 +202,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
delete builder;
// Finish and check for file errors
if (s.ok() && !options.disableDataSync) {
if (options.use_fsync) {
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
if (s.ok() && !ioptions.disable_data_sync) {
if (ioptions.use_fsync) {
StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
s = file->Fsync();
} else {
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
s = file->Sync();
}
}
@ -211,7 +217,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
if (s.ok()) {
// Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
Iterator* it = table_cache->NewIterator(ReadOptions(), env_options,
internal_comparator, meta->fd);
s = it->status();
delete it;

@ -11,6 +11,7 @@
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/options.h"
#include "rocksdb/immutable_options.h"
namespace rocksdb {
@ -26,8 +27,10 @@ class TableBuilder;
class WritableFile;
extern TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type);
const ImmutableCFOptions& options,
const InternalKeyComparator& internal_comparator,
WritableFile* file, const CompressionType compression_type,
const CompressionOptions& compression_opts);
// Build a Table file from the contents of *iter. The generated file
// will be named according to number specified in meta. On success, the rest of
@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder(
// If no data is present in *iter, meta->file_size will be set to
// zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname, Env* env,
const Options& options, const EnvOptions& soptions,
const ImmutableCFOptions& options,
const EnvOptions& env_options,
TableCache* table_cache, Iterator* iter,
FileMetaData* meta,
const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression,
const CompressionOptions& compression_opts,
const Env::IOPriority io_priority = Env::IO_HIGH);
} // namespace rocksdb

@ -29,6 +29,7 @@
#include "rocksdb/statistics.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/utilities/backupable_db.h"
using rocksdb::Cache;
using rocksdb::ColumnFamilyDescriptor;
@ -56,6 +57,7 @@ using rocksdb::NewBloomFilterPolicy;
using rocksdb::NewLRUCache;
using rocksdb::Options;
using rocksdb::BlockBasedTableOptions;
using rocksdb::CuckooTableOptions;
using rocksdb::RandomAccessFile;
using rocksdb::Range;
using rocksdb::ReadOptions;
@ -68,21 +70,32 @@ using rocksdb::WritableFile;
using rocksdb::WriteBatch;
using rocksdb::WriteOptions;
using rocksdb::LiveFileMetaData;
using rocksdb::BackupEngine;
using rocksdb::BackupableDBOptions;
using rocksdb::BackupInfo;
using rocksdb::RestoreOptions;
using std::shared_ptr;
extern "C" {
struct rocksdb_t { DB* rep; };
struct rocksdb_backup_engine_t { BackupEngine* rep; };
struct rocksdb_backup_engine_info_t { std::vector<BackupInfo> rep; };
struct rocksdb_restore_options_t { RestoreOptions rep; };
struct rocksdb_iterator_t { Iterator* rep; };
struct rocksdb_writebatch_t { WriteBatch rep; };
struct rocksdb_snapshot_t { const Snapshot* rep; };
struct rocksdb_flushoptions_t { FlushOptions rep; };
struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
struct rocksdb_readoptions_t { ReadOptions rep; };
struct rocksdb_readoptions_t {
ReadOptions rep;
Slice upper_bound; // storage for the bound key; rep.iterate_upper_bound points here
};
struct rocksdb_writeoptions_t { WriteOptions rep; };
struct rocksdb_options_t { Options rep; };
struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; };
struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; };
struct rocksdb_seqfile_t { SequentialFile* rep; };
struct rocksdb_randomfile_t { RandomAccessFile* rep; };
struct rocksdb_writablefile_t { WritableFile* rep; };
@ -118,7 +131,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter {
const Slice& existing_value,
std::string* new_value,
bool* value_changed) const {
char* c_new_value = NULL;
char* c_new_value = nullptr;
size_t new_value_length = 0;
unsigned char c_value_changed = 0;
unsigned char result = (*filter_)(
@ -385,11 +398,9 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
unsigned char success;
size_t new_value_len;
char* tmp_new_value = (*full_merge_)(
state_,
key.data(), key.size(),
existing_value_data, existing_value_len,
&operand_pointers[0], &operand_sizes[0], n,
&success, &new_value_len);
state_, key.data(), key.size(), existing_value_data, existing_value_len,
&operand_pointers[0], &operand_sizes[0], static_cast<int>(n), &success,
&new_value_len);
new_value->assign(tmp_new_value, new_value_len);
if (delete_value_ != nullptr) {
@ -417,7 +428,7 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
size_t new_value_len;
char* tmp_new_value = (*partial_merge_)(
state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
operand_count, &success, &new_value_len);
static_cast<int>(operand_count), &success, &new_value_len);
new_value->assign(tmp_new_value, new_value_len);
if (delete_value_ != nullptr) {
@ -524,6 +535,85 @@ rocksdb_t* rocksdb_open_for_read_only(
return result;
}
rocksdb_backup_engine_t* rocksdb_backup_engine_open(
const rocksdb_options_t* options, const char* path, char** errptr) {
BackupEngine* be;
if (SaveError(errptr, BackupEngine::Open(options->rep.env,
BackupableDBOptions(path), &be))) {
return nullptr;
}
rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
result->rep = be;
return result;
}
void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
rocksdb_t* db, char** errptr) {
SaveError(errptr, be->rep->CreateNewBackup(db->rep));
}
rocksdb_restore_options_t* rocksdb_restore_options_create() {
return new rocksdb_restore_options_t;
}
void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
delete opt;
}
void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
int v) {
opt->rep.keep_log_files = v;
}
void rocksdb_backup_engine_restore_db_from_latest_backup(
rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
const rocksdb_restore_options_t* restore_options, char** errptr) {
SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
std::string(wal_dir),
restore_options->rep));
}
const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
rocksdb_backup_engine_t* be) {
rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
be->rep->GetBackupInfo(&result->rep);
return result;
}
int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
return static_cast<int>(info->rep.size());
}
const int64_t rocksdb_backup_engine_info_timestamp(
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].timestamp;
}
const uint32_t rocksdb_backup_engine_info_backup_id(
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].backup_id;
}
const uint64_t rocksdb_backup_engine_info_size(
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].size;
}
const uint32_t rocksdb_backup_engine_info_number_files(
const rocksdb_backup_engine_info_t* info, int index) {
return info->rep[index].number_files;
}
void rocksdb_backup_engine_info_destroy(
const rocksdb_backup_engine_info_t* info) {
delete info;
}
void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
delete be->rep;
delete be;
}
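// A rough end-to-end sketch of the backup API above, mirroring the
// backup_and_restore phase in c_test.c (error handling elided; paths are
// placeholders):
//   rocksdb_backup_engine_t* be =
//       rocksdb_backup_engine_open(options, backup_dir, &err);
//   rocksdb_backup_engine_create_new_backup(be, db, &err);
//   rocksdb_restore_options_t* ro = rocksdb_restore_options_create();
//   rocksdb_backup_engine_restore_db_from_latest_backup(
//       be, db_dir, wal_dir, ro, &err);
//   rocksdb_restore_options_destroy(ro);
//   rocksdb_backup_engine_close(be);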
void rocksdb_close(rocksdb_t* db) {
delete db->rep;
delete db;
@ -1123,6 +1213,51 @@ void rocksdb_options_set_block_based_table_factory(
}
rocksdb_cuckoo_table_options_t*
rocksdb_cuckoo_options_create() {
return new rocksdb_cuckoo_table_options_t;
}
void rocksdb_cuckoo_options_destroy(
rocksdb_cuckoo_table_options_t* options) {
delete options;
}
void rocksdb_cuckoo_options_set_hash_ratio(
rocksdb_cuckoo_table_options_t* options, double v) {
options->rep.hash_table_ratio = v;
}
void rocksdb_cuckoo_options_set_max_search_depth(
rocksdb_cuckoo_table_options_t* options, uint32_t v) {
options->rep.max_search_depth = v;
}
void rocksdb_cuckoo_options_set_cuckoo_block_size(
rocksdb_cuckoo_table_options_t* options, uint32_t v) {
options->rep.cuckoo_block_size = v;
}
void rocksdb_cuckoo_options_set_identity_as_first_hash(
rocksdb_cuckoo_table_options_t* options, unsigned char v) {
options->rep.identity_as_first_hash = v;
}
void rocksdb_cuckoo_options_set_use_module_hash(
rocksdb_cuckoo_table_options_t* options, unsigned char v) {
options->rep.use_module_hash = v;
}
void rocksdb_options_set_cuckoo_table_factory(
rocksdb_options_t *opt,
rocksdb_cuckoo_table_options_t* table_options) {
if (table_options) {
opt->rep.table_factory.reset(
rocksdb::NewCuckooTableFactory(table_options->rep));
}
}
rocksdb_options_t* rocksdb_options_create() {
return new rocksdb_options_t;
}
@ -1216,6 +1351,11 @@ void rocksdb_options_set_info_log_level(
opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
}
void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
size_t s) {
opt->rep.db_write_buffer_size = s;
}
void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s;
}
@ -1224,6 +1364,10 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
opt->rep.max_open_files = n;
}
void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) {
opt->rep.max_total_wal_size = n;
}
void rocksdb_options_set_target_file_size_base(
rocksdb_options_t* opt, uint64_t n) {
opt->rep.target_file_size_base = n;
@ -1355,8 +1499,8 @@ void rocksdb_options_set_purge_redundant_kvs_while_flush(
opt->rep.purge_redundant_kvs_while_flush = v;
}
void rocksdb_options_set_allow_os_buffer(
rocksdb_options_t* opt, unsigned char v) {
void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt,
unsigned char v) {
opt->rep.allow_os_buffer = v;
}
@ -1581,11 +1725,6 @@ void rocksdb_options_set_bloom_locality(
opt->rep.bloom_locality = v;
}
void rocksdb_options_set_allow_thread_local(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.allow_thread_local = v;
}
void rocksdb_options_set_inplace_update_support(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.inplace_update_support = v;
@ -1844,6 +1983,19 @@ void rocksdb_readoptions_set_snapshot(
opt->rep.snapshot = (snap ? snap->rep : nullptr);
}
void rocksdb_readoptions_set_iterate_upper_bound(
rocksdb_readoptions_t* opt,
const char* key, size_t keylen) {
if (key == nullptr) {
opt->upper_bound = Slice();
opt->rep.iterate_upper_bound = nullptr;
} else {
opt->upper_bound = Slice(key, keylen);
opt->rep.iterate_upper_bound = &opt->upper_bound;
}
}
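// Illustrative use (see the iterate_upper_bound phase in c_test.c): with an
// upper bound of "foo2", a forward iterator turns invalid once it reaches a
// key >= "foo2"; passing NULL clears the bound again:
//   rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
//   ...
//   rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);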
void rocksdb_readoptions_set_read_tier(
rocksdb_readoptions_t* opt, int v) {
opt->rep.read_tier = static_cast<rocksdb::ReadTier>(v);
@ -2039,7 +2191,7 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level
int rocksdb_livefiles_count(
const rocksdb_livefiles_t* lf) {
return lf->rep.size();
return static_cast<int>(lf->rep.size());
}
const char* rocksdb_livefiles_name(

@ -10,9 +10,11 @@
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <inttypes.h>
const char* phase = "";
static char dbname[200];
static char dbbackupname[200];
static void StartPhase(const char* name) {
fprintf(stderr, "=== Test %s\n", name);
@ -132,7 +134,7 @@ static void CmpDestroy(void* arg) { }
static int CmpCompare(void* arg, const char* a, size_t alen,
const char* b, size_t blen) {
int n = (alen < blen) ? alen : blen;
size_t n = (alen < blen) ? alen : blen;
int r = memcmp(a, b, n);
if (r == 0) {
if (alen < blen) r = -1;
@ -346,6 +348,11 @@ int main(int argc, char** argv) {
GetTempDir(),
((int) geteuid()));
snprintf(dbbackupname, sizeof(dbbackupname),
"%s/rocksdb_c_test-%d-backup",
GetTempDir(),
((int) geteuid()));
StartPhase("create_objects");
cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
env = rocksdb_create_default_env();
@ -396,6 +403,41 @@ int main(int argc, char** argv) {
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
StartPhase("backup_and_restore");
{
rocksdb_destroy_db(options, dbbackupname, &err);
CheckNoError(err);
rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err);
CheckNoError(err);
rocksdb_backup_engine_create_new_backup(be, db, &err);
CheckNoError(err);
rocksdb_delete(db, woptions, "foo", 3, &err);
CheckNoError(err);
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
CheckNoError(err);
rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create();
rocksdb_restore_options_set_keep_log_files(restore_options, 0);
rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err);
CheckNoError(err);
rocksdb_restore_options_destroy(restore_options);
rocksdb_options_set_error_if_exists(options, 0);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
rocksdb_options_set_error_if_exists(options, 1);
CheckGet(db, roptions, "foo", "hello");
rocksdb_backup_engine_close(be);
}
StartPhase("compactall");
rocksdb_compact_range(db, NULL, 0, NULL, 0);
CheckGet(db, roptions, "foo", "hello");
@ -576,37 +618,39 @@ int main(int argc, char** argv) {
StartPhase("compaction_filter");
{
rocksdb_options_t* options = rocksdb_options_create();
rocksdb_options_set_create_if_missing(options, 1);
rocksdb_options_t* options_with_filter = rocksdb_options_create();
rocksdb_options_set_create_if_missing(options_with_filter, 1);
rocksdb_compactionfilter_t* cfilter;
cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
CFilterFilter, CFilterName);
// Create new database
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
rocksdb_options_set_compaction_filter(options, cfilter);
db = CheckCompaction(db, options, roptions, woptions);
rocksdb_destroy_db(options_with_filter, dbname, &err);
rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
db = CheckCompaction(db, options_with_filter, roptions, woptions);
rocksdb_options_set_compaction_filter(options, NULL);
rocksdb_options_set_compaction_filter(options_with_filter, NULL);
rocksdb_compactionfilter_destroy(cfilter);
rocksdb_options_destroy(options);
rocksdb_options_destroy(options_with_filter);
}
StartPhase("compaction_filter_factory");
{
rocksdb_options_t* options = rocksdb_options_create();
rocksdb_options_set_create_if_missing(options, 1);
rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
rocksdb_compactionfilterfactory_t* factory;
factory = rocksdb_compactionfilterfactory_create(
NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
// Create new database
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
rocksdb_options_set_compaction_filter_factory(options, factory);
db = CheckCompaction(db, options, roptions, woptions);
rocksdb_options_set_compaction_filter_factory(options, NULL);
rocksdb_options_destroy(options);
rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
factory);
db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
rocksdb_options_set_compaction_filter_factory(
options_with_filter_factory, NULL);
rocksdb_options_destroy(options_with_filter_factory);
}
StartPhase("compaction_filter_v2");
@ -799,8 +843,87 @@ int main(int argc, char** argv) {
rocksdb_iter_get_error(iter, &err);
CheckNoError(err);
rocksdb_iter_destroy(iter);
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
}
StartPhase("cuckoo_options");
{
rocksdb_cuckoo_table_options_t* cuckoo_options;
cuckoo_options = rocksdb_cuckoo_options_create();
rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
rocksdb_cuckoo_options_destroy(cuckoo_options);
}
StartPhase("iterate_upper_bound");
{
// Create new empty database
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
CheckNoError(err);
rocksdb_options_set_prefix_extractor(options, NULL);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "a", 1, "0", 1, &err); CheckNoError(err);
rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); CheckNoError(err);
rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err);
rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); CheckNoError(err);
// testing basic case with no iterate_upper_bound and no prefix_extractor
{
rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
rocksdb_iter_seek(iter, "foo", 3);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "foo", "bar");
rocksdb_iter_next(iter);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "foo1", "bar1");
rocksdb_iter_next(iter);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "g1", "0");
rocksdb_iter_destroy(iter);
}
// testing iterate_upper_bound and forward iterator
// to make sure it stops at bound
{
// iterate_upper_bound points beyond the last expected entry
rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
rocksdb_iter_seek(iter, "foo", 3);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "foo", "bar");
rocksdb_iter_next(iter);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "foo1", "bar1");
rocksdb_iter_next(iter);
// should stop here...
CheckCondition(!rocksdb_iter_valid(iter));
rocksdb_iter_destroy(iter);
}
}
StartPhase("cleanup");
rocksdb_close(db);

@ -9,24 +9,65 @@
#include "db/column_family.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <vector>
#include <string>
#include <algorithm>
#include <limits>
#include "db/compaction_picker.h"
#include "db/db_impl.h"
#include "db/job_context.h"
#include "db/version_set.h"
#include "db/writebuffer.h"
#include "db/internal_stats.h"
#include "db/compaction_picker.h"
#include "db/job_context.h"
#include "db/table_properties_collector.h"
#include "db/version_set.h"
#include "db/write_controller.h"
#include "util/autovector.h"
#include "util/hash_skiplist_rep.h"
#include "util/options_helper.h"
namespace rocksdb {
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
DBImpl* db, port::Mutex* mutex)
: cfd_(cfd), db_(db), mutex_(mutex) {
namespace {
// This function computes the amount of time in microseconds by which a write
// should be delayed based on the number of level-0 files according to the
// following formula:
// if n < bottom, return 0;
// if n >= top, return 1000;
// otherwise, let r = (n - bottom) / (top - bottom)
// and return r^2 * 1000.
// The goal of this formula is to gradually increase the rate at which writes
// are slowed. We also tried linear delay (r * 1000), but it seemed to do
// slightly worse. There is no other particular reason for choosing quadratic.
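// Worked example: with bottom = 4 and top = 8, n = 6 gives r = 0.5 and a
// delay of max(0.5^2 * 1000, 100) = 250 microseconds; the max() in the code
// floors any non-zero delay in this band at 100 microseconds.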
uint64_t SlowdownAmount(int n, double bottom, double top) {
uint64_t delay;
if (n >= top) {
delay = 1000;
} else if (n < bottom) {
delay = 0;
} else {
// If we are here, we know that:
// bottom <= n < top
// since the previous two conditions are false.
double how_much = static_cast<double>(n - bottom) / (top - bottom);
delay = std::max(how_much * how_much * 1000, 100.0);
}
assert(delay <= 1000);
return delay;
}
} // namespace
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
: cfd_(column_family_data), db_(db), mutex_(mutex) {
if (cfd_ != nullptr) {
cfd_->Ref();
}
@ -34,21 +75,29 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
if (cfd_ != nullptr) {
DBImpl::DeletionState deletion_state;
JobContext job_context;
mutex_->Lock();
if (cfd_->Unref()) {
delete cfd_;
}
db_->FindObsoleteFiles(deletion_state, false, true);
db_->FindObsoleteFiles(&job_context, false, true);
mutex_->Unlock();
if (deletion_state.HaveSomethingToDelete()) {
db_->PurgeObsoleteFiles(deletion_state);
if (job_context.HaveSomethingToDelete()) {
db_->PurgeObsoleteFiles(job_context);
}
}
}
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
const std::string& ColumnFamilyHandleImpl::GetName() const {
return cfd()->GetName();
}
const Comparator* ColumnFamilyHandleImpl::user_comparator() const {
return cfd()->user_comparator();
}
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
const ColumnFamilyOptions& src) {
ColumnFamilyOptions result = src;
@ -132,7 +181,7 @@ SuperVersion* SuperVersion::Ref() {
bool SuperVersion::Unref() {
// fetch_sub returns the previous value of ref
uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
uint32_t previous_refs = refs.fetch_sub(1);
assert(previous_refs > 0);
return previous_refs == 1;
}
@ -174,20 +223,22 @@ void SuperVersionUnrefHandle(void* ptr) {
}
} // anonymous namespace
ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
Version* dummy_versions, Cache* table_cache,
const ColumnFamilyOptions& options,
const DBOptions* db_options,
const EnvOptions& storage_options,
ColumnFamilySet* column_family_set)
ColumnFamilyData::ColumnFamilyData(
uint32_t id, const std::string& name, Version* _dummy_versions,
Cache* _table_cache, WriteBuffer* write_buffer,
const ColumnFamilyOptions& cf_options, const DBOptions* db_options,
const EnvOptions& env_options, ColumnFamilySet* column_family_set)
: id_(id),
name_(name),
dummy_versions_(dummy_versions),
dummy_versions_(_dummy_versions),
current_(nullptr),
refs_(0),
dropped_(false),
internal_comparator_(options.comparator),
options_(*db_options, SanitizeOptions(&internal_comparator_, options)),
internal_comparator_(cf_options.comparator),
options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)),
ioptions_(options_),
mutable_cf_options_(options_, ioptions_),
write_buffer_(write_buffer),
mem_(nullptr),
imm_(options_.min_write_buffer_number_to_merge),
super_version_(nullptr),
@ -196,48 +247,69 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
next_(nullptr),
prev_(nullptr),
log_number_(0),
need_slowdown_for_num_level0_files_(false),
column_family_set_(column_family_set) {
column_family_set_(column_family_set),
pending_flush_(false),
pending_compaction_(false) {
Ref();
// if dummy_versions is nullptr, then this is a dummy column family.
if (dummy_versions != nullptr) {
// if _dummy_versions is nullptr, then this is a dummy column family.
if (_dummy_versions != nullptr) {
internal_stats_.reset(
new InternalStats(options_.num_levels, db_options->env, this));
table_cache_.reset(new TableCache(&options_, storage_options, table_cache));
if (options_.compaction_style == kCompactionStyleUniversal) {
new InternalStats(ioptions_.num_levels, db_options->env, this));
table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache));
if (ioptions_.compaction_style == kCompactionStyleLevel) {
compaction_picker_.reset(
new UniversalCompactionPicker(&options_, &internal_comparator_));
} else if (options_.compaction_style == kCompactionStyleLevel) {
new LevelCompactionPicker(ioptions_, &internal_comparator_));
#ifndef ROCKSDB_LITE
} else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
compaction_picker_.reset(
new LevelCompactionPicker(&options_, &internal_comparator_));
new UniversalCompactionPicker(ioptions_, &internal_comparator_));
} else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
compaction_picker_.reset(
new FIFOCompactionPicker(ioptions_, &internal_comparator_));
} else if (ioptions_.compaction_style == kCompactionStyleNone) {
compaction_picker_.reset(new NullCompactionPicker(
ioptions_, &internal_comparator_));
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
"Column family %s does not use any background compaction. "
"Compactions can only be done via CompactFiles\n",
GetName().c_str());
#endif // !ROCKSDB_LITE
} else {
assert(options_.compaction_style == kCompactionStyleFIFO);
Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log,
"Unable to recognize the specified compaction style %d. "
"Column family %s will use kCompactionStyleLevel.\n",
ioptions_.compaction_style, GetName().c_str());
compaction_picker_.reset(
new FIFOCompactionPicker(&options_, &internal_comparator_));
new LevelCompactionPicker(ioptions_, &internal_comparator_));
}
Log(options_.info_log, "Options for column family \"%s\":\n",
name.c_str());
const ColumnFamilyOptions* cf_options = &options_;
cf_options->Dump(options_.info_log.get());
if (column_family_set_->NumberOfColumnFamilies() < 10) {
Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
"--------------- Options for column family [%s]:\n", name.c_str());
options_.Dump(ioptions_.info_log);
} else {
Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
"\t(skipping printing options)\n");
}
}
RecalculateWriteStallConditions();
RecalculateWriteStallConditions(mutable_cf_options_);
}
// DB mutex held
ColumnFamilyData::~ColumnFamilyData() {
assert(refs_ == 0);
assert(refs_.load(std::memory_order_relaxed) == 0);
// remove from linked list
auto prev = prev_;
auto next = next_;
prev->next_ = next;
next->prev_ = prev;
// it's nullptr for dummy CFD
if (column_family_set_ != nullptr) {
// remove from column_family_set
if (!dropped_ && column_family_set_ != nullptr) {
// If it's dropped, it's already removed from column family set
// If column_family_set_ == nullptr, this is dummy CFD and not in
// ColumnFamilySet
column_family_set_->RemoveColumnFamily(this);
}
@ -245,6 +317,11 @@ ColumnFamilyData::~ColumnFamilyData() {
current_->Unref();
}
// It would be wrong if this ColumnFamilyData is in flush_queue_ or
// compaction_queue_ and we destroyed it
assert(!pending_flush_);
assert(!pending_compaction_);
if (super_version_ != nullptr) {
// Release SuperVersion reference kept in ThreadLocalPtr.
// This must be done outside of mutex_ since unref handler can lock mutex.
@ -262,8 +339,9 @@ ColumnFamilyData::~ColumnFamilyData() {
if (dummy_versions_ != nullptr) {
// List must be empty
assert(dummy_versions_->next_ == dummy_versions_);
delete dummy_versions_;
assert(dummy_versions_->TEST_Next() == dummy_versions_);
bool deleted __attribute__((unused)) = dummy_versions_->Unref();
assert(deleted);
}
if (mem_ != nullptr) {
@ -276,90 +354,146 @@ ColumnFamilyData::~ColumnFamilyData() {
}
}
void ColumnFamilyData::RecalculateWriteStallConditions() {
need_wait_for_num_memtables_ =
(imm()->size() == options()->max_write_buffer_number - 1);
void ColumnFamilyData::SetDropped() {
// can't drop default CF
assert(id_ != 0);
dropped_ = true;
write_controller_token_.reset();
if (current_ != nullptr) {
need_wait_for_num_level0_files_ =
(current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger);
} else {
need_wait_for_num_level0_files_ = false;
}
RecalculateWriteStallRateLimitsConditions();
// remove from column_family_set
column_family_set_->RemoveColumnFamily(this);
}
void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() {
void ColumnFamilyData::RecalculateWriteStallConditions(
const MutableCFOptions& mutable_cf_options) {
if (current_ != nullptr) {
exceeds_hard_rate_limit_ =
(options()->hard_rate_limit > 1.0 &&
current_->MaxCompactionScore() > options()->hard_rate_limit);
exceeds_soft_rate_limit_ =
(options()->soft_rate_limit > 0.0 &&
current_->MaxCompactionScore() > options()->soft_rate_limit);
} else {
exceeds_hard_rate_limit_ = false;
exceeds_soft_rate_limit_ = false;
auto* vstorage = current_->storage_info();
const double score = vstorage->max_compaction_score();
const int max_level = vstorage->max_compaction_score_level();
auto write_controller = column_family_set_->write_controller_;
if (imm()->size() >= mutable_cf_options.max_write_buffer_number) {
write_controller_token_ = write_controller->GetStopToken();
internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
"[%s] Stopping writes because we have %d immutable memtables "
"(waiting for flush), max_write_buffer_number is set to %d",
name_.c_str(), imm()->size(),
mutable_cf_options.max_write_buffer_number);
} else if (vstorage->NumLevelFiles(0) >=
mutable_cf_options.level0_stop_writes_trigger) {
write_controller_token_ = write_controller->GetStopToken();
internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
"[%s] Stopping writes because we have %d level-0 files",
name_.c_str(), vstorage->NumLevelFiles(0));
} else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
vstorage->NumLevelFiles(0) >=
mutable_cf_options.level0_slowdown_writes_trigger) {
uint64_t slowdown =
SlowdownAmount(vstorage->NumLevelFiles(0),
mutable_cf_options.level0_slowdown_writes_trigger,
mutable_cf_options.level0_stop_writes_trigger);
write_controller_token_ = write_controller->GetDelayToken(slowdown);
internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
"[%s] Stalling writes because we have %d level-0 files (%" PRIu64
"us)",
name_.c_str(), vstorage->NumLevelFiles(0), slowdown);
} else if (mutable_cf_options.hard_rate_limit > 1.0 &&
score > mutable_cf_options.hard_rate_limit) {
uint64_t kHardLimitSlowdown = 1000;
write_controller_token_ =
write_controller->GetDelayToken(kHardLimitSlowdown);
internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown,
false);
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
"[%s] Stalling writes because we hit hard limit on level %d. "
"(%" PRIu64 "us)",
name_.c_str(), max_level, kHardLimitSlowdown);
} else if (mutable_cf_options.soft_rate_limit > 0.0 &&
score > mutable_cf_options.soft_rate_limit) {
uint64_t slowdown = SlowdownAmount(score,
mutable_cf_options.soft_rate_limit,
mutable_cf_options.hard_rate_limit);
write_controller_token_ = write_controller->GetDelayToken(slowdown);
internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true);
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
"[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
"us)",
name_.c_str(), max_level, slowdown);
} else {
write_controller_token_.reset();
}
}
}
const EnvOptions* ColumnFamilyData::soptions() const {
return &(column_family_set_->storage_options_);
return &(column_family_set_->env_options_);
}
void ColumnFamilyData::SetCurrent(Version* current) {
current_ = current;
need_slowdown_for_num_level0_files_ =
(options_.level0_slowdown_writes_trigger >= 0 &&
current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
void ColumnFamilyData::SetCurrent(Version* current_version) {
current_ = current_version;
}
void ColumnFamilyData::CreateNewMemtable() {
MemTable* ColumnFamilyData::ConstructNewMemtable(
const MutableCFOptions& mutable_cf_options) {
assert(current_ != nullptr);
return new MemTable(internal_comparator_, ioptions_,
mutable_cf_options, write_buffer_);
}
void ColumnFamilyData::CreateNewMemtable(
const MutableCFOptions& mutable_cf_options) {
if (mem_ != nullptr) {
delete mem_->Unref();
}
mem_ = new MemTable(internal_comparator_, options_);
SetMemtable(ConstructNewMemtable(mutable_cf_options));
mem_->Ref();
}
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
auto result = compaction_picker_->PickCompaction(current_, log_buffer);
RecalculateWriteStallRateLimitsConditions();
bool ColumnFamilyData::NeedsCompaction() const {
return compaction_picker_->NeedsCompaction(current_->storage_info());
}
Compaction* ColumnFamilyData::PickCompaction(
const MutableCFOptions& mutable_options, LogBuffer* log_buffer) {
auto* result = compaction_picker_->PickCompaction(
GetName(), mutable_options, current_->storage_info(), log_buffer);
if (result != nullptr) {
result->SetInputVersion(current_);
}
return result;
}
Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
uint32_t output_path_id,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end) {
return compaction_picker_->CompactRange(current_, input_level, output_level,
output_path_id, begin, end,
compaction_end);
Compaction* ColumnFamilyData::CompactRange(
const MutableCFOptions& mutable_cf_options,
int input_level, int output_level, uint32_t output_path_id,
const InternalKey* begin, const InternalKey* end,
InternalKey** compaction_end) {
auto* result = compaction_picker_->CompactRange(
GetName(), mutable_cf_options, current_->storage_info(), input_level,
output_level, output_path_id, begin, end, compaction_end);
if (result != nullptr) {
result->SetInputVersion(current_);
}
return result;
}
SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
port::Mutex* db_mutex) {
InstrumentedMutex* db_mutex) {
SuperVersion* sv = nullptr;
if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
sv = GetThreadLocalSuperVersion(db_mutex);
sv->Ref();
if (!ReturnThreadLocalSuperVersion(sv)) {
sv->Unref();
}
} else {
db_mutex->Lock();
sv = super_version_->Ref();
db_mutex->Unlock();
sv = GetThreadLocalSuperVersion(db_mutex);
sv->Ref();
if (!ReturnThreadLocalSuperVersion(sv)) {
sv->Unref();
}
return sv;
}
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
port::Mutex* db_mutex) {
InstrumentedMutex* db_mutex) {
SuperVersion* sv = nullptr;
// The SuperVersion is cached in thread local storage to avoid acquiring
// mutex when SuperVersion does not change since the last use. When a new
@ -382,11 +516,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
sv = static_cast<SuperVersion*>(ptr);
if (sv == SuperVersion::kSVObsolete ||
sv->version_number != super_version_number_.load()) {
RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
SuperVersion* sv_to_delete = nullptr;
if (sv && sv->Unref()) {
RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
db_mutex->Lock();
// NOTE: underlying resources held by superversion (sst files) might
// not be released until the next background job.
@ -422,20 +556,68 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
return false;
}
void ColumnFamilyData::NotifyOnCompactionCompleted(
DB* db, Compaction* c, const Status& status) {
#ifndef ROCKSDB_LITE
auto listeners = ioptions()->listeners;
CompactionJobInfo info;
info.cf_name = c->column_family_data()->GetName();
info.status = status;
info.output_level = c->output_level();
for (const auto fmd : *c->inputs(c->level())) {
info.input_files.push_back(
TableFileName(options_.db_paths,
fmd->fd.GetNumber(),
fmd->fd.GetPathId()));
}
for (const auto newf : c->edit()->GetNewFiles()) {
info.input_files.push_back(
TableFileName(options_.db_paths,
newf.second.fd.GetNumber(),
newf.second.fd.GetPathId()));
}
for (auto listener : listeners) {
listener->OnCompactionCompleted(db, info);
}
#endif // ROCKSDB_LITE
}
void ColumnFamilyData::NotifyOnFlushCompleted(
DB* db, const std::string& file_path,
bool triggered_flush_slowdown,
bool triggered_flush_stop) {
#ifndef ROCKSDB_LITE
auto listeners = ioptions()->listeners;
for (auto listener : listeners) {
listener->OnFlushCompleted(
db, GetName(), file_path,
// Use path 0, as filled memtables are flushed into path 0 first.
triggered_flush_slowdown, triggered_flush_stop);
}
#endif // ROCKSDB_LITE
}
SuperVersion* ColumnFamilyData::InstallSuperVersion(
SuperVersion* new_superversion, InstrumentedMutex* db_mutex) {
db_mutex->AssertHeld();
return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_);
}
SuperVersion* ColumnFamilyData::InstallSuperVersion(
SuperVersion* new_superversion, port::Mutex* db_mutex) {
SuperVersion* new_superversion, InstrumentedMutex* db_mutex,
const MutableCFOptions& mutable_cf_options) {
new_superversion->db_mutex = db_mutex;
new_superversion->mutable_cf_options = mutable_cf_options;
new_superversion->Init(mem_, imm_.current(), current_);
SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion;
++super_version_number_;
super_version_->version_number = super_version_number_;
// Reset SuperVersions cached in thread local storage
if (column_family_set_->db_options_->allow_thread_local) {
ResetThreadLocalSuperVersions();
}
ResetThreadLocalSuperVersions();
RecalculateWriteStallConditions();
RecalculateWriteStallConditions(mutable_cf_options);
if (old_superversion != nullptr && old_superversion->Unref()) {
old_superversion->Cleanup();
@ -460,20 +642,37 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() {
}
}
#ifndef ROCKSDB_LITE
Status ColumnFamilyData::SetOptions(
const std::unordered_map<std::string, std::string>& options_map) {
MutableCFOptions new_mutable_cf_options;
Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
&new_mutable_cf_options);
if (s.ok()) {
mutable_cf_options_ = new_mutable_cf_options;
mutable_cf_options_.RefreshDerivedOptions(ioptions_);
}
return s;
}
#endif // ROCKSDB_LITE
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const DBOptions* db_options,
const EnvOptions& storage_options,
Cache* table_cache)
const EnvOptions& env_options,
Cache* table_cache,
WriteBuffer* write_buffer,
WriteController* write_controller)
: max_column_family_(0),
dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr,
dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr,
ColumnFamilyOptions(), db_options,
storage_options_, nullptr)),
env_options, nullptr)),
default_cfd_cache_(nullptr),
db_name_(dbname),
db_options_(db_options),
storage_options_(storage_options),
env_options_(env_options),
table_cache_(table_cache),
spin_lock_(ATOMIC_FLAG_INIT) {
write_buffer_(write_buffer),
write_controller_(write_controller) {
// initialize linked list
dummy_cfd_->prev_ = dummy_cfd_;
dummy_cfd_->next_ = dummy_cfd_;
@ -530,18 +729,17 @@ size_t ColumnFamilySet::NumberOfColumnFamilies() const {
return column_families_.size();
}
// under a DB mutex
// under a DB mutex AND write thread
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
const std::string& name, uint32_t id, Version* dummy_versions,
const ColumnFamilyOptions& options) {
assert(column_families_.find(name) == column_families_.end());
ColumnFamilyData* new_cfd =
new ColumnFamilyData(id, name, dummy_versions, table_cache_, options,
db_options_, storage_options_, this);
Lock();
new ColumnFamilyData(id, name, dummy_versions, table_cache_,
write_buffer_, options, db_options_,
env_options_, this);
column_families_.insert({name, id});
column_family_data_.insert({id, new_cfd});
Unlock();
max_column_family_ = std::max(max_column_family_, id);
// add to linked list
new_cfd->next_ = dummy_cfd_;
@ -555,19 +753,11 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
return new_cfd;
}
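CreateColumnFamily() splices the new ColumnFamilyData into the circular doubly-linked list headed by the dummy_cfd_ sentinel. A self-contained sketch of the splice and the matching unlink; field names follow the prev_/next_ members used here:

struct Node {
  Node* prev_;
  Node* next_;
};

// Insert 'node' immediately before the sentinel, i.e. at the list tail.
// With only the sentinel present, prev_ == next_ == dummy, so the same
// four stores also handle the first insertion: no empty-list special case.
void InsertBeforeDummy(Node* dummy, Node* node) {
  node->next_ = dummy;
  node->prev_ = dummy->prev_;
  node->prev_->next_ = node;
  dummy->prev_ = node;
}

// Unlink without deleting; FreeDeadColumnFamilies() does the equivalent
// before destroying a dead entry.
void Unlink(Node* node) {
  node->prev_->next_ = node->next_;
  node->next_->prev_ = node->prev_;
}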
void ColumnFamilySet::Lock() {
// spin lock
while (spin_lock_.test_and_set(std::memory_order_acquire)) {
}
}
void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
autovector<ColumnFamilyData*> to_delete;
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
if (cfd->refs_ == 0) {
if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
to_delete.push_back(cfd);
}
}
@ -577,25 +767,21 @@ void ColumnFamilySet::FreeDeadColumnFamilies() {
}
}
// under a DB mutex
// under a DB mutex AND from a write thread
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
auto cfd_iter = column_family_data_.find(cfd->GetID());
assert(cfd_iter != column_family_data_.end());
Lock();
column_family_data_.erase(cfd_iter);
column_families_.erase(cfd->GetName());
Unlock();
}
// under a DB mutex OR from a write thread
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
if (column_family_id == 0) {
// optimization for common case
current_ = column_family_set_->GetDefault();
} else {
// maybe outside of db mutex, should lock
column_family_set_->Lock();
current_ = column_family_set_->GetColumnFamily(column_family_id);
column_family_set_->Unlock();
}
handle_.SetCFD(current_);
return current_ != nullptr;
@ -611,16 +797,18 @@ MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
return current_->mem();
}
const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
assert(current_ != nullptr);
return current_->options();
}
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
assert(current_ != nullptr);
return &handle_;
}
void ColumnFamilyMemTablesImpl::CheckMemtableFull() {
if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) {
flush_scheduler_->ScheduleFlush(current_);
current_->mem()->MarkFlushScheduled();
}
}
uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
@ -630,4 +818,13 @@ uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
return column_family_id;
}
const Comparator* GetColumnFamilyUserComparator(
ColumnFamilyHandle* column_family) {
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
return cfh->user_comparator();
}
return nullptr;
}
} // namespace rocksdb
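Both free functions above tolerate a null handle, which lets call sites pass an optional column family straight through. A small usage sketch; DescribeHandle is hypothetical:

#include <cstdio>
#include "db/column_family.h"

void DescribeHandle(rocksdb::ColumnFamilyHandle* cf) {
  uint32_t id = rocksdb::GetColumnFamilyID(cf);  // 0 (default CF) if null
  const rocksdb::Comparator* ucmp =
      rocksdb::GetColumnFamilyUserComparator(cf);  // nullptr if cf is null
  std::printf("cf=%u comparator=%s\n", id,
              ucmp != nullptr ? ucmp->Name() : "(none)");
}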

@ -19,7 +19,11 @@
#include "rocksdb/env.h"
#include "db/memtable_list.h"
#include "db/write_batch_internal.h"
#include "db/write_controller.h"
#include "db/table_cache.h"
#include "db/flush_scheduler.h"
#include "util/instrumented_mutex.h"
#include "util/mutable_cf_options.h"
#include "util/thread_local.h"
namespace rocksdb {
@ -35,6 +39,8 @@ class InternalStats;
class ColumnFamilyData;
class DBImpl;
class LogBuffer;
class InstrumentedMutex;
class InstrumentedMutexLock;
// ColumnFamilyHandleImpl is the class that clients use to access different
// column families. It has non-trivial destructor, which gets called when client
@ -42,17 +48,20 @@ class LogBuffer;
class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
public:
// create while holding the mutex
ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
ColumnFamilyHandleImpl(
ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
// destroy without mutex
virtual ~ColumnFamilyHandleImpl();
virtual ColumnFamilyData* cfd() const { return cfd_; }
virtual const Comparator* user_comparator() const;
virtual uint32_t GetID() const;
virtual const std::string& GetName() const override;
private:
ColumnFamilyData* cfd_;
DBImpl* db_;
port::Mutex* mutex_;
InstrumentedMutex* mutex_;
};
// Does not ref-count ColumnFamilyData
@ -66,7 +75,7 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
ColumnFamilyHandleInternal()
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
private:
@ -78,6 +87,7 @@ struct SuperVersion {
MemTable* mem;
MemTableListVersion* imm;
Version* current;
MutableCFOptions mutable_cf_options;
std::atomic<uint32_t> refs;
// We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then
@ -85,7 +95,7 @@ struct SuperVersion {
autovector<MemTable*> to_delete;
// Version number of the current SuperVersion
uint64_t version_number;
port::Mutex* db_mutex;
InstrumentedMutex* db_mutex;
// should be called outside the mutex
SuperVersion() = default;
@ -117,8 +127,7 @@ extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
class ColumnFamilySet;
// This class keeps all the data that a column family needs. It's mostly dumb and
// used just to provide access to metadata.
// This class keeps all the data that a column family needs.
// Most methods require DB mutex held, unless otherwise noted
class ColumnFamilyData {
public:
@ -129,17 +138,24 @@ class ColumnFamilyData {
// thread-safe
const std::string& GetName() const { return name_; }
void Ref() { ++refs_; }
// Ref() can only be called while holding a DB mutex or during a
// single-threaded write.
void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); }
// will just decrease the reference count to 0, but will not delete it. Returns
// true if the ref count was decreased to zero. In that case, it can be
// deleted by the caller immediatelly, or later, by calling
// deleted by the caller immediately, or later, by calling
// FreeDeadColumnFamilies()
// Unref() can only be called while holding a DB mutex
bool Unref() {
assert(refs_ > 0);
return --refs_ == 0;
int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed);
assert(old_refs > 0);
return old_refs == 1;
}
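A note on this pattern: fetch_sub() returns the pre-decrement value, so seeing 1 means this call dropped the count to zero, and relaxed ordering suffices only because the final Unref() is serialized by the DB mutex. A standalone model of the same contract:

#include <atomic>
#include <cassert>

struct RefCounted {
  std::atomic<int> refs{1};

  // May race with other Ref()/Unref() calls; no ordering needed to bump.
  void Ref() { refs.fetch_add(1, std::memory_order_relaxed); }

  // Returns true when this call dropped the last reference. A general-
  // purpose refcount would need release/acquire here (as shared_ptr does);
  // relaxed is safe only under an external serialization contract like
  // the DB mutex described above.
  bool Unref() {
    int old_refs = refs.fetch_sub(1, std::memory_order_relaxed);
    assert(old_refs > 0);
    return old_refs == 1;
  }
};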
// This can only be called from single-threaded VersionSet::LogAndApply()
// SetDropped() can only be called under the following conditions:
// 1) Holding a DB mutex,
// 2) from single-threaded write thread, AND
// 3) from single-threaded VersionSet::LogAndApply()
// After dropping a column family, no other operation on that column family
// will be executed. All the files and memory will, however, be kept around
// until the client drops the column family handle. That way, the client can still
@ -147,27 +163,42 @@ class ColumnFamilyData {
// A column family can be dropped and still be alive. In that state:
// *) Column family is not included in the iteration.
// *) Compaction and flush are not executed on the dropped column family.
// *) Client can continue writing and reading from column family. However, all
// writes stay in the current memtable.
// *) Client can continue reading from column family. Writes will fail unless
// WriteOptions::ignore_missing_column_families is true
// When the dropped column family is unreferenced, then we:
// *) delete all memory associated with that column family
// *) delete all the files associated with that column family
void SetDropped() {
// can't drop default CF
assert(id_ != 0);
dropped_ = true;
}
void SetDropped();
bool IsDropped() const { return dropped_; }
// thread-safe
int NumberLevels() const { return options_.num_levels; }
int NumberLevels() const { return ioptions_.num_levels; }
void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
uint64_t GetLogNumber() const { return log_number_; }
// thread-safe
// !!! To be deprecated! Please do not use this function anymore!
const Options* options() const { return &options_; }
// thread-safe
const EnvOptions* soptions() const;
const ImmutableCFOptions* ioptions() const { return &ioptions_; }
// REQUIRES: DB mutex held
// This returns the MutableCFOptions used by the current SuperVersion
// You should use this API to reference MutableCFOptions most of the time.
const MutableCFOptions* GetCurrentMutableCFOptions() const {
return &(super_version_->mutable_cf_options);
}
// REQUIRES: DB mutex held
// This returns the latest MutableCFOptions, which may not be in effect yet.
const MutableCFOptions* GetLatestMutableCFOptions() const {
return &mutable_cf_options_;
}
#ifndef ROCKSDB_LITE
// REQUIRES: DB mutex held
Status SetOptions(
const std::unordered_map<std::string, std::string>& options_map);
#endif // ROCKSDB_LITE
InternalStats* internal_stats() { return internal_stats_.get(); }
@ -175,18 +206,25 @@ class ColumnFamilyData {
MemTable* mem() { return mem_; }
Version* current() { return current_; }
Version* dummy_versions() { return dummy_versions_; }
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
void SetCurrent(Version* current);
void CreateNewMemtable();
MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options);
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
void CreateNewMemtable(const MutableCFOptions& mutable_cf_options);
TableCache* table_cache() const { return table_cache_.get(); }
// See documentation in compaction_picker.h
Compaction* PickCompaction(LogBuffer* log_buffer);
Compaction* CompactRange(int input_level, int output_level,
uint32_t output_path_id, const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end);
// REQUIRES: DB mutex held
bool NeedsCompaction() const;
// REQUIRES: DB mutex held
Compaction* PickCompaction(const MutableCFOptions& mutable_options,
LogBuffer* log_buffer);
// REQUIRES: DB mutex held
Compaction* CompactRange(
const MutableCFOptions& mutable_cf_options,
int input_level, int output_level, uint32_t output_path_id,
const InternalKey* begin, const InternalKey* end,
InternalKey** compaction_end);
CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
// thread-safe
@ -201,11 +239,11 @@ class ColumnFamilyData {
SuperVersion* GetSuperVersion() { return super_version_; }
// thread-safe
// Return an already-referenced SuperVersion to be used safely.
SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
SuperVersion* GetReferencedSuperVersion(InstrumentedMutex* db_mutex);
// thread-safe
// Get SuperVersion stored in thread local storage. If it does not exist,
// get a reference from a current SuperVersion.
SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
SuperVersion* GetThreadLocalSuperVersion(InstrumentedMutex* db_mutex);
// Try to return SuperVersion back to thread local storage. Return true on
// success and false on failure. It fails when the thread local storage
// contains anything other than SuperVersion::kSVInUse flag.
@ -218,40 +256,35 @@ class ColumnFamilyData {
// if its reference count is zero and it needs deletion, or nullptr if not.
// As an argument it takes a pointer to an allocated SuperVersion to enable
// the clients to allocate the SuperVersion outside of the mutex.
// IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
port::Mutex* db_mutex);
InstrumentedMutex* db_mutex,
const MutableCFOptions& mutable_cf_options);
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
InstrumentedMutex* db_mutex);
void ResetThreadLocalSuperVersions();
// A flag indicating whether writes need to slow down because there are
// too many level-0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
bool NeedWaitForNumLevel0Files() const {
return need_wait_for_num_level0_files_;
}
bool NeedWaitForNumMemtables() const {
return need_wait_for_num_memtables_;
}
void NotifyOnCompactionCompleted(DB* db, Compaction* c, const Status& status);
bool ExceedsSoftRateLimit() const {
return exceeds_soft_rate_limit_;
}
void NotifyOnFlushCompleted(
DB* db, const std::string& file_path,
bool triggered_flush_slowdown,
bool triggered_flush_stop);
bool ExceedsHardRateLimit() const {
return exceeds_hard_rate_limit_;
}
// Protected by DB mutex
void set_pending_flush(bool value) { pending_flush_ = value; }
void set_pending_compaction(bool value) { pending_compaction_ = value; }
bool pending_flush() { return pending_flush_; }
bool pending_compaction() { return pending_compaction_; }
private:
friend class ColumnFamilySet;
ColumnFamilyData(uint32_t id, const std::string& name,
Version* dummy_versions, Cache* table_cache,
WriteBuffer* write_buffer,
const ColumnFamilyOptions& options,
const DBOptions* db_options,
const EnvOptions& storage_options,
const DBOptions* db_options, const EnvOptions& env_options,
ColumnFamilySet* column_family_set);
// Recalculate some small conditions, which are changed only during
@ -259,25 +292,29 @@ class ColumnFamilyData {
// recalculation of the compaction score. These values are used in
// DBImpl::MakeRoomForWrite to decide whether it needs to
// stall writes
void RecalculateWriteStallConditions();
void RecalculateWriteStallRateLimitsConditions();
void RecalculateWriteStallConditions(
const MutableCFOptions& mutable_cf_options);
uint32_t id_;
const std::string name_;
Version* dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions->prev_
int refs_; // outstanding references to ColumnFamilyData
std::atomic<int> refs_; // outstanding references to ColumnFamilyData
bool dropped_; // true if client dropped it
const InternalKeyComparator internal_comparator_;
Options const options_;
const Options options_;
const ImmutableCFOptions ioptions_;
MutableCFOptions mutable_cf_options_;
std::unique_ptr<TableCache> table_cache_;
std::unique_ptr<InternalStats> internal_stats_;
WriteBuffer* write_buffer_;
MemTable* mem_;
MemTableList imm_;
SuperVersion* super_version_;
@ -301,46 +338,38 @@ class ColumnFamilyData {
// recovered from
uint64_t log_number_;
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
// These 4 variables are updated only after a compaction,
// after adding a new memtable, after flushing memtables to files,
// and/or after recalculation of the compaction score.
// That's why their values are cached in ColumnFamilyData.
// Recalculation is done by the RecalculateWriteStallConditions and
// RecalculateWriteStallRateLimitsConditions functions. They are used
// in DBImpl::MakeRoomForWrite to decide whether a write needs
// to sleep during the write operation
bool need_wait_for_num_memtables_;
bool need_wait_for_num_level0_files_;
bool exceeds_hard_rate_limit_;
bool exceeds_soft_rate_limit_;
// An object that keeps all the compaction stats
// and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;
ColumnFamilySet* column_family_set_;
std::unique_ptr<WriteControllerToken> write_controller_token_;
// If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
bool pending_flush_;
// If true --> this ColumnFamily is currently present in
// DBImpl::compaction_queue_
bool pending_compaction_;
};
// ColumnFamilySet has interesting thread-safety requirements
// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
// mutex. Inside, column_family_data_ and column_families_ will be protected
// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
// VersionSet::LogAndApply() in the normal runtime. It is also called
// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
// from ColumnFamilyData destructor
// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
// mutex AND executed in the write thread.
// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
// single-threaded write thread. It is also called during Recovery and in
// DumpManifest().
// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
// held and it needs to be executed from the write thread. SetDropped() also
// guarantees that it will be called only from single-threaded LogAndApply(),
// but this condition is not that important.
// * Iteration -- hold DB mutex, but you can release it in the body of
// iteration. If you release DB mutex in body, reference the column
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
// * GetColumnFamily() -- either inside of DB mutex or from a write thread
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
// NumberOfColumnFamilies -- inside of DB mutex
class ColumnFamilySet {
@ -354,7 +383,8 @@ class ColumnFamilySet {
// dummy is never dead or dropped, so this will never be infinite
do {
current_ = current_->next_;
} while (current_->refs_ == 0 || current_->IsDropped());
} while (current_->refs_.load(std::memory_order_relaxed) == 0 ||
current_->IsDropped());
return *this;
}
bool operator!=(const iterator& other) {
@ -367,7 +397,8 @@ class ColumnFamilySet {
};
ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
const EnvOptions& storage_options, Cache* table_cache);
const EnvOptions& env_options, Cache* table_cache,
WriteBuffer* write_buffer, WriteController* write_controller);
~ColumnFamilySet();
ColumnFamilyData* GetDefault() const;
@ -390,9 +421,6 @@ class ColumnFamilySet {
iterator begin() { return iterator(dummy_cfd_->next_); }
iterator end() { return iterator(dummy_cfd_); }
void Lock();
void Unlock();
// REQUIRES: DB mutex held
// Don't call while iterating over ColumnFamilySet
void FreeDeadColumnFamilies();
@ -404,9 +432,12 @@ class ColumnFamilySet {
void RemoveColumnFamily(ColumnFamilyData* cfd);
// column_families_ and column_family_data_ need to be protected:
// * when mutating: 1. DB mutex locked first, 2. spinlock locked second
// * when reading, either: 1. lock DB mutex, or 2. lock spinlock
// (if both, respect the ordering to avoid deadlock!)
// * when mutating both conditions have to be satisfied:
// 1. DB mutex locked
// 2. thread currently in single-threaded write thread
// * when reading, at least one condition needs to be satisfied:
// 1. DB mutex locked
// 2. accessed from a single-threaded write thread
std::unordered_map<std::string, uint32_t> column_families_;
std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
@ -420,41 +451,52 @@ class ColumnFamilySet {
const std::string db_name_;
const DBOptions* const db_options_;
const EnvOptions storage_options_;
const EnvOptions env_options_;
Cache* table_cache_;
std::atomic_flag spin_lock_;
WriteBuffer* write_buffer_;
WriteController* write_controller_;
};
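The iteration rule spelled out above translates to the following pattern: pin the current ColumnFamilyData before releasing the DB mutex, unref after re-acquiring it, and leave the actual deletion to FreeDeadColumnFamilies() once the loop is done. A hedged sketch; ForEachColumnFamily and DoExpensiveWork are hypothetical:

#include "db/column_family.h"
#include "util/instrumented_mutex.h"

void DoExpensiveWork(rocksdb::ColumnFamilyData* cfd);  // hypothetical

void ForEachColumnFamily(rocksdb::InstrumentedMutex* db_mutex,
                         rocksdb::ColumnFamilySet* cf_set) {
  db_mutex->AssertHeld();
  for (auto cfd : *cf_set) {
    cfd->Ref();            // cfd may otherwise be freed while unlocked
    db_mutex->Unlock();
    DoExpensiveWork(cfd);  // runs without the DB mutex
    db_mutex->Lock();
    cfd->Unref();          // no delete here; the list still links cfd
  }
  // Deferred cleanup, outside the iteration as the comment requires.
  cf_set->FreeDeadColumnFamilies();
}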
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
// memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
public:
explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
: column_family_set_(column_family_set), current_(nullptr) {}
explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set,
FlushScheduler* flush_scheduler)
: column_family_set_(column_family_set),
current_(nullptr),
flush_scheduler_(flush_scheduler) {}
// sets current_ to ColumnFamilyData with column_family_id
// returns false if column family doesn't exist
// REQUIRES: under a DB mutex OR from a write thread
bool Seek(uint32_t column_family_id) override;
// Returns log number of the selected column family
// REQUIRES: under a DB mutex OR from a write thread
uint64_t GetLogNumber() const override;
// REQUIRES: Seek() called first
// REQUIRES: under a DB mutex OR from a write thread
virtual MemTable* GetMemTable() const override;
// Returns options for selected column family
// REQUIRES: Seek() called first
virtual const Options* GetOptions() const override;
// Returns column family handle for the selected column family
// REQUIRES: under a DB mutex OR from a write thread
virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
// REQUIRES: under a DB mutex OR from a write thread
virtual void CheckMemtableFull() override;
private:
ColumnFamilySet* column_family_set_;
ColumnFamilyData* current_;
FlushScheduler* flush_scheduler_;
ColumnFamilyHandleInternal handle_;
};
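For orientation, a condensed, hypothetical model of how write-batch application drives this interface; the real MemTableInserter in db/write_batch.cc also consults GetLogNumber() during recovery and handles merges and deletes:

#include "db/column_family.h"
#include "db/dbformat.h"
#include "db/write_batch_internal.h"

// Route one write-batch entry (keyed by column-family ID) to its memtable.
rocksdb::Status InsertOne(rocksdb::ColumnFamilyMemTables* cf_mems,
                          uint32_t cf_id, const rocksdb::Slice& key,
                          const rocksdb::Slice& value,
                          rocksdb::SequenceNumber seq) {
  if (!cf_mems->Seek(cf_id)) {
    // Dropped or unknown column family: the write fails unless
    // WriteOptions::ignore_missing_column_families asked to skip it.
    return rocksdb::Status::InvalidArgument("Invalid column family");
  }
  cf_mems->GetMemTable()->Add(seq, rocksdb::kTypeValue, key, value);
  cf_mems->CheckMemtableFull();  // may schedule a flush via FlushScheduler
  return rocksdb::Status::OK();
}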
extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
extern const Comparator* GetColumnFamilyUserComparator(
ColumnFamilyHandle* column_family);
} // namespace rocksdb

@ -133,7 +133,7 @@ class ColumnFamilyTest {
void CreateColumnFamilies(
const std::vector<std::string>& cfs,
const std::vector<ColumnFamilyOptions> options = {}) {
int cfi = handles_.size();
int cfi = static_cast<int>(handles_.size());
handles_.resize(cfi + cfs.size());
names_.resize(cfi + cfs.size());
for (size_t i = 0; i < cfs.size(); ++i) {
@ -218,7 +218,7 @@ class ColumnFamilyTest {
int NumTableFilesAtLevel(int level, int cf) {
return GetProperty(cf,
"rocksdb.num-files-at-level" + std::to_string(level));
"rocksdb.num-files-at-level" + ToString(level));
}
// Return spread of files per level
@ -231,7 +231,7 @@ class ColumnFamilyTest {
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
last_non_zero_offset = static_cast<int>(result.size());
}
}
result.resize(last_non_zero_offset);
@ -287,8 +287,8 @@ class ColumnFamilyTest {
assert(num_per_cf.size() == handles_.size());
for (size_t i = 0; i < num_per_cf.size(); ++i) {
ASSERT_EQ(num_per_cf[i],
GetProperty(i, "rocksdb.num-immutable-mem-table"));
ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
"rocksdb.num-immutable-mem-table"));
}
}
@ -387,7 +387,7 @@ TEST(ColumnFamilyTest, DropTest) {
Open({"default"});
CreateColumnFamiliesAndReopen({"pikachu"});
for (int i = 0; i < 100; ++i) {
ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i)));
}
ASSERT_OK(Flush(1));
@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
WriteBatch batch;
batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
ASSERT_OK(db_->Write(WriteOptions(), &batch));
DropColumnFamilies({1});
WriteOptions woptions_ignore_missing_cf;
woptions_ignore_missing_cf.ignore_missing_column_families = true;
batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
ASSERT_EQ("column-family", Get(0, "still here"));
Status s = db_->Write(WriteOptions(), &batch);
ASSERT_TRUE(s.IsInvalidArgument());
Close();
@ -523,8 +529,28 @@ TEST(ColumnFamilyTest, FlushTest) {
ASSERT_OK(Put(1, "mirko", "v3"));
ASSERT_OK(Put(0, "foo", "v2"));
ASSERT_OK(Put(2, "fodor", "v5"));
for (int i = 0; i < 3; ++i) {
Flush(i);
for (int j = 0; j < 2; j++) {
ReadOptions ro;
std::vector<Iterator*> iterators;
// Hold super version.
if (j == 0) {
ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
}
for (int i = 0; i < 3; ++i) {
uint64_t max_total_in_memory_state =
dbfull()->TEST_max_total_in_memory_state();
Flush(i);
ASSERT_EQ(dbfull()->TEST_max_total_in_memory_state(),
max_total_in_memory_state);
}
ASSERT_OK(Put(1, "foofoo", "bar"));
ASSERT_OK(Put(0, "foofoo", "bar"));
for (auto* it : iterators) {
delete it;
}
}
Reopen();
@ -705,6 +731,27 @@ TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
Close();
}
TEST(ColumnFamilyTest, MemtableNotSupportSnapshot) {
Open();
auto* s1 = dbfull()->GetSnapshot();
ASSERT_TRUE(s1 != nullptr);
dbfull()->ReleaseSnapshot(s1);
// Add a column family that doesn't support snapshot
ColumnFamilyOptions first;
first.memtable_factory.reset(NewHashCuckooRepFactory(1024 * 1024));
CreateColumnFamilies({"first"}, {first});
auto* s2 = dbfull()->GetSnapshot();
ASSERT_TRUE(s2 == nullptr);
// Add a column family that supports snapshots. Snapshots remain unsupported.
ColumnFamilyOptions second;
CreateColumnFamilies({"second"}, {second});
auto* s3 = dbfull()->GetSnapshot();
ASSERT_TRUE(s3 == nullptr);
Close();
}
TEST(ColumnFamilyTest, DifferentMergeOperators) {
Open();
CreateColumnFamilies({"first", "second"});
@ -768,14 +815,14 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(1, 11, 10000);
WaitForFlush(1);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
ASSERT_EQ(ToString(i + 1), FilesPerLevel(1));
}
// SETUP column family "two" -- level style with 4 levels
for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(2, 15, 10000);
WaitForFlush(2);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
ASSERT_EQ(ToString(i + 1), FilesPerLevel(2));
}
// TRIGGER compaction "one"
@ -910,11 +957,11 @@ TEST(ColumnFamilyTest, DontRollEmptyLogs) {
CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
for (size_t i = 0; i < handles_.size(); ++i) {
PutRandomData(i, 10, 100);
PutRandomData(static_cast<int>(i), 10, 100);
}
int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
// this will trigger the flushes
for (size_t i = 0; i <= 4; ++i) {
for (int i = 0; i <= 4; ++i) {
ASSERT_OK(Flush(i));
}

@ -9,7 +9,10 @@
#include "db/compaction.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <vector>
@ -26,7 +29,17 @@ uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
return sum;
}
Compaction::Compaction(Version* input_version, int start_level, int out_level,
void Compaction::SetInputVersion(Version* _input_version) {
input_version_ = _input_version;
cfd_ = input_version_->cfd();
cfd_->Ref();
input_version_->Ref();
edit_ = new VersionEdit();
edit_->SetColumnFamily(cfd_->GetID());
}
Compaction::Compaction(int number_levels, int start_level, int out_level,
uint64_t target_file_size,
uint64_t max_grandparent_overlap_bytes,
uint32_t output_path_id,
@ -36,9 +49,10 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level,
output_level_(out_level),
max_output_file_size_(target_file_size),
max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
input_version_(input_version),
number_levels_(input_version_->NumberLevels()),
cfd_(input_version_->cfd_),
input_version_(nullptr),
edit_(nullptr),
number_levels_(number_levels),
cfd_(nullptr),
output_path_id_(output_path_id),
output_compression_(output_compression),
seek_compaction_(seek_compaction),
@ -53,11 +67,6 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level,
is_full_compaction_(false),
is_manual_compaction_(false),
level_ptrs_(std::vector<size_t>(number_levels_)) {
cfd_->Ref();
input_version_->Ref();
edit_ = new VersionEdit();
edit_->SetColumnFamily(cfd_->GetID());
for (int i = 0; i < number_levels_; i++) {
level_ptrs_[i] = 0;
}
@ -69,6 +78,38 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level,
}
}
Compaction::Compaction(VersionStorageInfo* vstorage,
const autovector<CompactionInputFiles>& _inputs,
int _start_level, int _output_level,
uint64_t _max_grandparent_overlap_bytes,
const CompactionOptions& _options,
bool _deletion_compaction)
: start_level_(_start_level),
output_level_(_output_level),
max_output_file_size_(_options.output_file_size_limit),
max_grandparent_overlap_bytes_(_max_grandparent_overlap_bytes),
input_version_(nullptr),
number_levels_(vstorage->num_levels()),
cfd_(nullptr),
output_compression_(_options.compression),
seek_compaction_(false),
deletion_compaction_(_deletion_compaction),
inputs_(_inputs),
grandparent_index_(0),
seen_key_(false),
overlapped_bytes_(0),
base_index_(-1),
parent_index_(-1),
score_(0),
bottommost_level_(false),
is_full_compaction_(false),
is_manual_compaction_(false),
level_ptrs_(std::vector<size_t>(number_levels_)) {
for (int i = 0; i < number_levels_; i++) {
level_ptrs_[i] = 0;
}
}
Compaction::~Compaction() {
delete edit_;
if (input_version_ != nullptr) {
@ -83,8 +124,9 @@ Compaction::~Compaction() {
void Compaction::GenerateFileLevels() {
input_levels_.resize(num_input_levels());
for (int which = 0; which < num_input_levels(); which++) {
DoGenerateFileLevel(&input_levels_[which], inputs_[which].files, &arena_);
for (size_t which = 0; which < num_input_levels(); which++) {
DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
&arena_);
}
}
@ -98,26 +140,29 @@ bool Compaction::IsTrivialMove() const {
num_input_levels() == 2 &&
num_input_files(0) == 1 &&
num_input_files(1) == 0 &&
input(0, 0)->fd.GetPathId() == GetOutputPathId() &&
TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
}
void Compaction::AddInputDeletions(VersionEdit* edit) {
for (int which = 0; which < num_input_levels(); which++) {
void Compaction::AddInputDeletions(VersionEdit* out_edit) {
for (size_t which = 0; which < num_input_levels(); which++) {
for (size_t i = 0; i < inputs_[which].size(); i++) {
edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
}
}
}
bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
assert(input_version_ != nullptr);
assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
return bottommost_level_;
}
// Maybe use binary search to find right entry instead of linear search?
const Comparator* user_cmp = cfd_->user_comparator();
for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
const std::vector<FileMetaData*>& files =
input_version_->storage_info()->LevelFiles(lvl);
for (; level_ptrs_[lvl] < files.size(); ) {
FileMetaData* f = files[level_ptrs_[lvl]];
if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
@ -163,7 +208,7 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {
// Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
for (int i = 0; i < num_input_levels(); i++) {
for (size_t i = 0; i < num_input_levels(); i++) {
for (unsigned int j = 0; j < inputs_[i].size(); j++) {
assert(mark_as_compacted ? !inputs_[i][j]->being_compacted :
inputs_[i][j]->being_compacted);
@ -173,9 +218,9 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
}
// Is this compaction producing files at the bottommost level?
void Compaction::SetupBottomMostLevel(bool is_manual) {
assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
void Compaction::SetupBottomMostLevel(VersionStorageInfo* vstorage,
bool is_manual, bool level0_only) {
if (level0_only) {
// If universal compaction style is used and manual
// compaction is occurring, then we are guaranteed that
// all files will be picked in a single compaction
@ -190,32 +235,20 @@ void Compaction::SetupBottomMostLevel(bool is_manual) {
bottommost_level_ = true;
// checks whether there are files living beyond the output_level.
for (int i = output_level_ + 1; i < number_levels_; i++) {
if (input_version_->NumLevelFiles(i) > 0) {
if (vstorage->NumLevelFiles(i) > 0) {
bottommost_level_ = false;
break;
}
}
}
void Compaction::ReleaseInputs() {
if (input_version_ != nullptr) {
input_version_->Unref();
input_version_ = nullptr;
}
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
cfd_ = nullptr;
}
}
void Compaction::ReleaseCompactionFiles(Status status) {
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
}
void Compaction::ResetNextCompactionIndex() {
input_version_->ResetNextCompactionIndex(start_level_);
assert(input_version_ != nullptr);
input_version_->storage_info()->ResetNextCompactionIndex(start_level_);
}
namespace {
@ -248,14 +281,15 @@ void Compaction::Summary(char* output, int len) {
return;
}
for (int level = 0; level < num_input_levels(); ++level) {
if (level > 0) {
for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
if (level_iter > 0) {
write += snprintf(output + write, len - write, "], [");
if (write < 0 || write >= len) {
return;
}
}
write += InputSummary(inputs_[level].files, output + write, len - write);
write +=
InputSummary(inputs_[level_iter].files, output + write, len - write);
if (write < 0 || write >= len) {
return;
}
@ -264,15 +298,15 @@ void Compaction::Summary(char* output, int len) {
snprintf(output + write, len - write, "]");
}
uint64_t Compaction::OutputFilePreallocationSize() {
uint64_t Compaction::OutputFilePreallocationSize(
const MutableCFOptions& mutable_options) {
uint64_t preallocation_size = 0;
if (cfd_->options()->compaction_style == kCompactionStyleLevel) {
preallocation_size =
cfd_->compaction_picker()->MaxFileSizeForLevel(output_level());
if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
preallocation_size = mutable_options.MaxFileSizeForLevel(output_level());
} else {
for (int level = 0; level < num_input_levels(); ++level) {
for (const auto& f : inputs_[level].files) {
for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
for (const auto& f : inputs_[level_iter].files) {
preallocation_size += f->fd.GetFileSize();
}
}
@ -282,4 +316,15 @@ uint64_t Compaction::OutputFilePreallocationSize() {
return preallocation_size * 1.1;
}
Compaction* Compaction::TEST_NewCompaction(
int num_levels, int start_level, int out_level, uint64_t target_file_size,
uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id,
CompressionType output_compression, bool seek_compaction,
bool deletion_compaction) {
return new Compaction(num_levels, start_level, out_level, target_file_size,
max_grandparent_overlap_bytes, output_path_id,
output_compression, seek_compaction,
deletion_compaction);
}
} // namespace rocksdb

@ -10,6 +10,7 @@
#pragma once
#include "util/arena.h"
#include "util/autovector.h"
#include "util/mutable_cf_options.h"
#include "db/version_set.h"
namespace rocksdb {
@ -22,15 +23,23 @@ struct CompactionInputFiles {
inline bool empty() const { return files.empty(); }
inline size_t size() const { return files.size(); }
inline void clear() { files.clear(); }
inline FileMetaData* operator[](int i) const { return files[i]; }
inline FileMetaData* operator[](size_t i) const { return files[i]; }
};
class Version;
class ColumnFamilyData;
class VersionStorageInfo;
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
Compaction(VersionStorageInfo* input_version,
const autovector<CompactionInputFiles>& inputs,
int start_level, int output_level,
uint64_t max_grandparent_overlap_bytes,
const CompactionOptions& options,
bool deletion_compaction);
// No copying allowed
Compaction(const Compaction&) = delete;
void operator=(const Compaction&) = delete;
@ -39,7 +48,7 @@ class Compaction {
// Returns the level associated with the specified compaction input level.
// If compaction_input_level is not specified, then input_level is set to 0.
int level(int compaction_input_level = 0) const {
int level(size_t compaction_input_level = 0) const {
return inputs_[compaction_input_level].level;
}
@ -47,7 +56,7 @@ class Compaction {
int output_level() const { return output_level_; }
// Returns the number of input levels in this compaction.
int num_input_levels() const { return inputs_.size(); }
size_t num_input_levels() const { return inputs_.size(); }
// Return the object that holds the edits to the descriptor done
// by this compaction.
@ -57,7 +66,7 @@ class Compaction {
// compaction input level.
// The function will return 0 when "compaction_input_level" < 0
// or "compaction_input_level" >= "num_input_levels()".
int num_input_files(size_t compaction_input_level) const {
size_t num_input_files(size_t compaction_input_level) const {
if (compaction_input_level < inputs_.size()) {
return inputs_[compaction_input_level].size();
}
@ -74,7 +83,7 @@ class Compaction {
// specified compaction input level.
// REQUIREMENT: "compaction_input_level" must be >= 0 and
// < "input_levels()"
FileMetaData* input(size_t compaction_input_level, int i) const {
FileMetaData* input(size_t compaction_input_level, size_t i) const {
assert(compaction_input_level < inputs_.size());
return inputs_[compaction_input_level][i];
}
@ -88,8 +97,8 @@ class Compaction {
return &inputs_[compaction_input_level].files;
}
// Returns the FileLevel of the specified compaction input level.
FileLevel* input_levels(int compaction_input_level) {
// Returns the LevelFilesBrief of the specified compaction input level.
LevelFilesBrief* input_levels(size_t compaction_input_level) {
return &input_levels_[compaction_input_level];
}
@ -110,7 +119,7 @@ class Compaction {
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
// If true, then the comaction can be done by simply deleting input files.
// If true, then the compaction can be done by simply deleting input files.
bool IsDeletionCompaction() const {
return deletion_compaction_;
}
@ -126,10 +135,6 @@ class Compaction {
// before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
void ReleaseCompactionFiles(Status status);
@ -151,10 +156,38 @@ class Compaction {
// Was this compaction triggered manually by the client?
bool IsManualCompaction() { return is_manual_compaction_; }
void SetOutputPathId(uint32_t path_id) { output_path_id_ = path_id; }
// Return the MutableCFOptions that should be used throughout the compaction
// procedure
const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; }
// Returns the size in bytes that the output file should be preallocated to.
// In level compaction, that is max_file_size_. In universal compaction, that
// is the sum of all input file sizes.
uint64_t OutputFilePreallocationSize();
uint64_t OutputFilePreallocationSize(const MutableCFOptions& mutable_options);
void SetInputVersion(Version* input_version);
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool mark_as_compacted);
// Initialize whether the compaction is producing files at the
// bottommost level.
//
// @see BottomMostLevel()
void SetupBottomMostLevel(VersionStorageInfo* vstorage, bool is_manual,
bool level0_only);
static Compaction* TEST_NewCompaction(
int num_levels, int start_level, int out_level, uint64_t target_file_size,
uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id,
CompressionType output_compression, bool seek_compaction = false,
bool deletion_compaction = false);
CompactionInputFiles* TEST_GetInputFiles(int l) {
return &inputs_[l];
}
private:
friend class CompactionPicker;
@ -162,7 +195,7 @@ class Compaction {
friend class FIFOCompactionPicker;
friend class LevelCompactionPicker;
Compaction(Version* input_version, int start_level, int out_level,
Compaction(int num_levels, int start_level, int out_level,
uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
uint32_t output_path_id, CompressionType output_compression,
bool seek_compaction = false, bool deletion_compaction = false);
@ -171,6 +204,7 @@ class Compaction {
const int output_level_; // levels to which output files are stored
uint64_t max_output_file_size_;
uint64_t max_grandparent_overlap_bytes_;
MutableCFOptions mutable_cf_options_;
Version* input_version_;
VersionEdit* edit_;
int number_levels_;
@ -187,7 +221,7 @@ class Compaction {
autovector<CompactionInputFiles> inputs_;
// A copy of inputs_, organized more closely in memory
autovector<FileLevel, 2> input_levels_;
autovector<LevelFilesBrief, 2> input_levels_;
// State used to check the number of overlapping grandparent files
// (grandparent == "output_level_ + 1")
@ -217,15 +251,6 @@ class Compaction {
// records indices for all levels beyond "output_level_".
std::vector<size_t> level_ptrs_;
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool mark_as_compacted);
// Initialize whether the compaction is producing files at the
// bottommost level.
//
// @see BottomMostLevel()
void SetupBottomMostLevel(bool is_manual);
// In case of compaction error, reset the nextIndex that is used
// to pick up the next file to be compacted from files_by_size_
void ResetNextCompactionIndex();

File diff suppressed because it is too large

@ -0,0 +1,127 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <atomic>
#include <deque>
#include <limits>
#include <set>
#include <utility>
#include <vector>
#include <string>
#include <functional>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "db/column_family.h"
#include "db/version_edit.h"
#include "db/memtable_list.h"
#include "port/port.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/transaction_log.h"
#include "util/autovector.h"
#include "util/stop_watch.h"
#include "util/thread_local.h"
#include "util/scoped_arena_iterator.h"
#include "db/internal_stats.h"
#include "db/write_controller.h"
#include "db/flush_scheduler.h"
#include "db/write_thread.h"
#include "db/job_context.h"
namespace rocksdb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class Arena;
class CompactionJob {
public:
// TODO(icanadi) make effort to reduce number of parameters here
// IMPORTANT: mutable_cf_options needs to be alive while CompactionJob is
// alive
CompactionJob(Compaction* compaction, const DBOptions& db_options,
const MutableCFOptions& mutable_cf_options,
const EnvOptions& env_options, VersionSet* versions,
std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
Directory* db_directory, Directory* output_directory,
Statistics* stats, SnapshotList* snapshot_list,
bool is_snapshot_supported, std::shared_ptr<Cache> table_cache,
std::function<uint64_t()> yield_callback);
~CompactionJob() { assert(compact_ == nullptr); }
// no copy/move
CompactionJob(CompactionJob&& job) = delete;
CompactionJob(const CompactionJob& job) = delete;
CompactionJob& operator=(const CompactionJob& job) = delete;
// REQUIRED: mutex held
void Prepare();
// REQUIRED: mutex not held
Status Run();
// REQUIRED: mutex held
// status is the return of Run()
void Install(Status* status, InstrumentedMutex* db_mutex);
private:
void AllocateCompactionOutputFileNumbers();
// Call compaction filter if is_compaction_v2 is not true. Then iterate
// through input and compact the kv-pairs
Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input,
bool is_compaction_v2);
// Call compaction_filter_v2->Filter() on kv-pairs in compact
void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2);
Status FinishCompactionOutputFile(Iterator* input);
Status InstallCompactionResults(InstrumentedMutex* db_mutex);
SequenceNumber findEarliestVisibleSnapshot(
SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
SequenceNumber* prev_snapshot);
void RecordCompactionIOStats();
Status OpenCompactionOutputFile();
void CleanupCompaction(const Status& status);
// CompactionJob state
struct CompactionState;
CompactionState* compact_;
bool bottommost_level_;
SequenceNumber earliest_snapshot_;
SequenceNumber visible_at_tip_;
SequenceNumber latest_snapshot_;
InternalStats::CompactionStats compaction_stats_;
// DBImpl state
const DBOptions& db_options_;
const MutableCFOptions& mutable_cf_options_;
const EnvOptions& env_options_;
Env* env_;
VersionSet* versions_;
std::atomic<bool>* shutting_down_;
LogBuffer* log_buffer_;
Directory* db_directory_;
Directory* output_directory_;
Statistics* stats_;
SnapshotList* snapshots_;
bool is_snapshot_supported_;
std::shared_ptr<Cache> table_cache_;
// yield callback
std::function<uint64_t()> yield_callback_;
};
} // namespace rocksdb

@ -0,0 +1,183 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <map>
#include <string>
#include "db/compaction_job.h"
#include "db/column_family.h"
#include "db/version_set.h"
#include "db/writebuffer.h"
#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/db.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "table/mock_table.h"
namespace rocksdb {
// TODO(icanadi) Make it simpler once we mock out VersionSet
class CompactionJobTest {
public:
CompactionJobTest()
: env_(Env::Default()),
dbname_(test::TmpDir() + "/compaction_job_test"),
mutable_cf_options_(Options(), ImmutableCFOptions(Options())),
table_cache_(NewLRUCache(50000, 16, 8)),
write_buffer_(db_options_.db_write_buffer_size),
versions_(new VersionSet(dbname_, &db_options_, env_options_,
table_cache_.get(), &write_buffer_,
&write_controller_)),
shutting_down_(false),
mock_table_factory_(new mock::MockTableFactory()) {
ASSERT_OK(env_->CreateDirIfMissing(dbname_));
db_options_.db_paths.emplace_back(dbname_,
std::numeric_limits<uint64_t>::max());
NewDB();
std::vector<ColumnFamilyDescriptor> column_families;
cf_options_.table_factory = mock_table_factory_;
column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
ASSERT_OK(versions_->Recover(column_families, false));
}
std::string GenerateFileName(uint64_t file_number) {
FileMetaData meta;
std::vector<DbPath> db_paths;
db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
meta.fd = FileDescriptor(file_number, 0, 0);
return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
}
// returns expected result after compaction
mock::MockFileContents CreateTwoFiles() {
mock::MockFileContents expected_results;
const int kKeysPerFile = 10000;
SequenceNumber sequence_number = 0;
for (int i = 0; i < 2; ++i) {
mock::MockFileContents contents;
SequenceNumber smallest_seqno = 0, largest_seqno = 0;
InternalKey smallest, largest;
for (int k = 0; k < kKeysPerFile; ++k) {
auto key = ToString(i * (kKeysPerFile / 2) + k);
auto value = ToString(i * kKeysPerFile + k);
InternalKey internal_key(key, ++sequence_number, kTypeValue);
if (k == 0) {
smallest = internal_key;
smallest_seqno = sequence_number;
} else if (k == kKeysPerFile - 1) {
largest = internal_key;
largest_seqno = sequence_number;
}
std::pair<std::string, std::string> key_value(
{internal_key.Encode().ToString(), value});
contents.insert(key_value);
if (i == 1 || k < kKeysPerFile / 2) {
expected_results.insert(key_value);
}
}
uint64_t file_number = versions_->NewFileNumber();
ASSERT_OK(mock_table_factory_->CreateMockTable(
env_, GenerateFileName(file_number), std::move(contents)));
VersionEdit edit;
edit.AddFile(0, file_number, 0, 10, smallest, largest, smallest_seqno,
largest_seqno);
mutex_.Lock();
versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
mutable_cf_options_, &edit, &mutex_);
mutex_.Unlock();
}
versions_->SetLastSequence(sequence_number);
return expected_results;
}
void NewDB() {
VersionEdit new_db;
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
const std::string manifest = DescriptorFileName(dbname_, 1);
unique_ptr<WritableFile> file;
Status s = env_->NewWritableFile(
manifest, &file, env_->OptimizeForManifestWrite(env_options_));
ASSERT_OK(s);
{
log::Writer log(std::move(file));
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);
}
ASSERT_OK(s);
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(env_, dbname_, 1, nullptr);
}
Env* env_;
std::string dbname_;
EnvOptions env_options_;
MutableCFOptions mutable_cf_options_;
std::shared_ptr<Cache> table_cache_;
WriteController write_controller_;
DBOptions db_options_;
ColumnFamilyOptions cf_options_;
WriteBuffer write_buffer_;
std::unique_ptr<VersionSet> versions_;
InstrumentedMutex mutex_;
std::atomic<bool> shutting_down_;
std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
};
TEST(CompactionJobTest, Simple) {
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
auto expected_results = CreateTwoFiles();
auto files = cfd->current()->storage_info()->LevelFiles(0);
ASSERT_EQ(2U, files.size());
std::unique_ptr<Compaction> compaction(Compaction::TEST_NewCompaction(
7, 0, 1, 1024 * 1024, 10, 0, kNoCompression));
compaction->SetInputVersion(cfd->current());
auto compaction_input_files = compaction->TEST_GetInputFiles(0);
compaction_input_files->level = 0;
compaction_input_files->files.push_back(files[0]);
compaction_input_files->files.push_back(files[1]);
SnapshotList snapshots;
int yield_callback_called = 0;
std::function<uint64_t()> yield_callback = [&]() {
yield_callback_called++;
return 0;
};
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
mutex_.Lock();
CompactionJob compaction_job(compaction.get(), db_options_,
*cfd->GetLatestMutableCFOptions(), env_options_,
versions_.get(), &shutting_down_, &log_buffer,
nullptr, nullptr, nullptr, &snapshots, true,
table_cache_, std::move(yield_callback));
compaction_job.Prepare();
mutex_.Unlock();
ASSERT_OK(compaction_job.Run());
mutex_.Lock();
Status s;
compaction_job.Install(&s, &mutex_);
ASSERT_OK(s);
mutex_.Unlock();
mock_table_factory_->AssertLatestFile(expected_results);
ASSERT_EQ(yield_callback_called, 20000);
}
} // namespace rocksdb
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

File diff suppressed because it is too large

@ -8,32 +8,43 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <vector>
#include <memory>
#include <set>
#include <unordered_set>
#include "db/version_set.h"
#include "db/compaction.h"
#include "rocksdb/status.h"
#include "rocksdb/options.h"
#include "rocksdb/env.h"
#include "util/mutable_cf_options.h"
#include <vector>
#include <memory>
#include <set>
#include <string>
namespace rocksdb {
class LogBuffer;
class Compaction;
class Version;
class VersionStorageInfo;
struct CompactionInputFiles;
class CompactionPicker {
public:
CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
CompactionPicker(const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icmp);
virtual ~CompactionPicker();
// Pick level and inputs for a new compaction.
// Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
virtual Compaction* PickCompaction(Version* version,
virtual Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage,
LogBuffer* log_buffer) = 0;
// Return a compaction object for compacting the range [begin,end] in
@ -47,36 +58,59 @@ class CompactionPicker {
// compaction_end will be set to nullptr.
// Client is responsible for compaction_end storage -- when called,
// *compaction_end should point to valid InternalKey!
virtual Compaction* CompactRange(Version* version, int input_level,
int output_level, uint32_t output_path_id,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end);
virtual Compaction* CompactRange(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, int input_level, int output_level,
uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
InternalKey** compaction_end);
// Given the current number of levels, returns the highest allowed level
// for compaction input.
virtual int MaxInputLevel(int current_num_levels) const = 0;
// Free up the files that participated in a compaction
void ReleaseCompactionFiles(Compaction* c, Status status);
// Return the total amount of data that is undergoing
// compactions per level
void SizeBeingCompacted(std::vector<uint64_t>& sizes);
// The maximum allowed output level. Default value is NumberLevels() - 1.
virtual int MaxOutputLevel() const {
return NumberLevels() - 1;
}
// Returns maximum total overlap bytes with grandparent
// level (i.e., level+2) before we stop building a single
// file in level->level+1 compaction.
uint64_t MaxGrandParentOverlapBytes(int level);
virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
// Sanitize the input set of compaction input files.
// When the input parameters do not describe a valid compaction, the
// function will try to fix the input_files by adding necessary
// files. If it's not possible to convert an invalid input_files
// into a valid one by adding more files, the function will return a
// non-ok status with a specific reason.
#ifndef ROCKSDB_LITE
Status SanitizeCompactionInputFiles(
std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta,
const int output_level) const;
#endif // ROCKSDB_LITE
// Returns maximum total bytes of data on a given level.
double MaxBytesForLevel(int level);
// Free up the files that participated in a compaction
void ReleaseCompactionFiles(Compaction* c, Status status);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level) const;
// Returns true if any one of the specified files is being compacted
bool FilesInCompaction(const std::vector<FileMetaData*>& files);
// Takes a list of CompactionInputFiles and returns a Compaction object.
Compaction* FormCompaction(
const CompactionOptions& compact_options,
const autovector<CompactionInputFiles>& input_files,
int output_level, VersionStorageInfo* vstorage,
const MutableCFOptions& mutable_cf_options) const;
// Converts a set of compaction input file numbers into
// a list of CompactionInputFiles.
Status GetCompactionInputsFromFileNumbers(
autovector<CompactionInputFiles>* input_files,
std::unordered_set<uint64_t>* input_set,
const VersionStorageInfo* vstorage,
const CompactionOptions& compact_options) const;
protected:
int NumberLevels() const { return num_levels_; }
int NumberLevels() const { return ioptions_.num_levels; }
// Stores the minimal range that covers all entries in inputs in
// *smallest, *largest.
@ -101,110 +135,179 @@ class CompactionPicker {
// populated.
//
// Will return false if it is impossible to apply this compaction.
bool ExpandWhileOverlapping(Compaction* c);
uint64_t ExpandedCompactionByteSizeLimit(int level);
// Returns true if any one of the specified files is being compacted
bool FilesInCompaction(std::vector<FileMetaData*>& files);
bool ExpandWhileOverlapping(const std::string& cf_name,
VersionStorageInfo* vstorage, Compaction* c);
// Returns true if any one of the parent files is being compacted
bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
bool ParentRangeInCompaction(VersionStorageInfo* vstorage,
const InternalKey* smallest,
const InternalKey* largest, int level,
int* index);
void SetupOtherInputs(Compaction* c);
void SetupOtherInputs(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, Compaction* c);
const ImmutableCFOptions& ioptions_;
// A helper function to SanitizeCompactionInputFiles() that
// sanitizes "input_files" by adding necessary files.
#ifndef ROCKSDB_LITE
virtual Status SanitizeCompactionInputFilesForAllLevels(
std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta,
const int output_level) const;
#endif // ROCKSDB_LITE
// record all the ongoing compactions for all levels
std::vector<std::set<Compaction*>> compactions_in_progress_;
// Per-level target file size.
std::unique_ptr<uint64_t[]> max_file_size_;
const InternalKeyComparator* const icmp_;
};
class LevelCompactionPicker : public CompactionPicker {
public:
LevelCompactionPicker(const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
virtual Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage,
LogBuffer* log_buffer) override;
// Per-level max bytes
std::unique_ptr<uint64_t[]> level_max_bytes_;
// Returns current_num_levels - 2, meaning the last level cannot be a
// compaction input level.
virtual int MaxInputLevel(int current_num_levels) const override {
return current_num_levels - 2;
}
const Options* const options_;
virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
override;
private:
int num_levels_;
// Pick a path ID to place a newly generated file, with its level
static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options,
int level);
const InternalKeyComparator* const icmp_;
private:
// For the specified level, pick a compaction.
// Returns nullptr if there is no compaction to be done.
// If level is 0 and there is already a compaction on that level, this
// function will return nullptr.
Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, int level,
double score);
};
#ifndef ROCKSDB_LITE
class UniversalCompactionPicker : public CompactionPicker {
public:
UniversalCompactionPicker(const Options* options,
UniversalCompactionPicker(const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
virtual Compaction* PickCompaction(Version* version,
: CompactionPicker(ioptions, icmp) {}
virtual Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage,
LogBuffer* log_buffer) override;
// The maximum allowed input level. Always return 0.
// The maximum allowed input level. Always returns 0.
virtual int MaxInputLevel(int current_num_levels) const override {
return 0;
}
// The maximum allowed output level. Always returns 0.
virtual int MaxOutputLevel() const override {
return 0;
}
virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
override;
private:
// Pick Universal compaction to limit read amplification
Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
unsigned int ratio,
unsigned int num_files,
LogBuffer* log_buffer);
Compaction* PickCompactionUniversalReadAmp(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, double score, unsigned int ratio,
unsigned int num_files, LogBuffer* log_buffer);
// Pick Universal compaction to limit space amplification.
Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
LogBuffer* log_buffer);
Compaction* PickCompactionUniversalSizeAmp(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, double score, LogBuffer* log_buffer);
// Pick a path ID to place a newly generated file, with its estimated file
// size.
static uint32_t GetPathId(const Options& options, uint64_t file_size);
static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
uint64_t file_size);
};
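// Illustrative sketch (not part of this patch) of the size-ratio heuristic
// behind PickCompactionUniversalReadAmp(): starting from the newest sorted
// run, keep absorbing the next run while the accumulated candidate stays
// within "ratio" percent of it. Simplified to plain integers; the function
// name and signature are stand-ins.
#include <cstddef>
#include <cstdint>
#include <vector>
size_t RunsToMergeForReadAmp(const std::vector<uint64_t>& run_sizes,  // newest first
                             unsigned int ratio) {
  if (run_sizes.empty()) return 0;
  uint64_t candidate_size = run_sizes[0];
  size_t n = 1;
  while (n < run_sizes.size() &&
         candidate_size * (100 + ratio) / 100 >= run_sizes[n]) {
    candidate_size += run_sizes[n++];  // absorb the next run, keep scanning
  }
  return n;  // merging the first n runs bounds read amplification
}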
class LevelCompactionPicker : public CompactionPicker {
class FIFOCompactionPicker : public CompactionPicker {
public:
LevelCompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
virtual Compaction* PickCompaction(Version* version,
FIFOCompactionPicker(const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
virtual Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* version,
LogBuffer* log_buffer) override;
// Returns current_num_levels - 2, meaning the last level cannot be a
// compaction input level.
virtual Compaction* CompactRange(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, int input_level, int output_level,
uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
InternalKey** compaction_end) override;
// The maximum allowed input level. Always returns 0.
virtual int MaxInputLevel(int current_num_levels) const override {
return current_num_levels - 2;
return 0;
}
private:
// For the specified level, pick a compaction.
// Returns nullptr if there is no compaction to be done.
// If level is 0 and there is already a compaction on that level, this
// function will return nullptr.
Compaction* PickCompactionBySize(Version* version, int level, double score);
// The maximum allowed output level. Always returns 0.
virtual int MaxOutputLevel() const override {
return 0;
}
virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
override;
};
class FIFOCompactionPicker : public CompactionPicker {
class NullCompactionPicker : public CompactionPicker {
public:
FIFOCompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
NullCompactionPicker(const ImmutableCFOptions& ioptions,
const InternalKeyComparator* icmp) :
CompactionPicker(ioptions, icmp) {}
virtual ~NullCompactionPicker() {}
// Always returns nullptr.
Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage,
LogBuffer* log_buffer) override {
return nullptr;
}
virtual Compaction* PickCompaction(Version* version,
LogBuffer* log_buffer) override;
// Always returns nullptr.
Compaction* CompactRange(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, int input_level, int output_level,
uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
InternalKey** compaction_end) override {
return nullptr;
}
virtual Compaction* CompactRange(Version* version, int input_level,
int output_level, uint32_t output_path_id,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end) override;
// Given the current number of levels, returns the highest allowed level
// for compaction input.
virtual int MaxInputLevel(int current_num_levels) const {
return current_num_levels - 2;
}
// The maximum allowed input level. Always returns 0.
virtual int MaxInputLevel(int current_num_levels) const override {
return 0;
// Always returns false.
virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
override {
return false;
}
};
// Utility function
extern uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files);
#endif // !ROCKSDB_LITE
} // namespace rocksdb

@ -0,0 +1,259 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/compaction_picker.h"
#include <limits>
#include <string>
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
class CountingLogger : public Logger {
public:
using Logger::Logv;
virtual void Logv(const char* format, va_list ap) override { log_count++; }
size_t log_count = 0;  // initialized so the counter does not start from garbage
};
class CompactionPickerTest {
public:
const Comparator* ucmp_;
InternalKeyComparator icmp_;
Options options_;
ImmutableCFOptions ioptions_;
MutableCFOptions mutable_cf_options_;
LevelCompactionPicker level_compaction_picker;
std::string cf_name_;
CountingLogger logger_;
LogBuffer log_buffer_;
uint32_t file_num_;
CompactionOptionsFIFO fifo_options_;
std::unique_ptr<VersionStorageInfo> vstorage_;
std::vector<std::unique_ptr<FileMetaData>> files_;
CompactionPickerTest()
: ucmp_(BytewiseComparator()),
icmp_(ucmp_),
ioptions_(options_),
mutable_cf_options_(options_, ioptions_),
level_compaction_picker(ioptions_, &icmp_),
cf_name_("dummy"),
log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
file_num_(1),
vstorage_(nullptr) {
fifo_options_.max_table_files_size = 1;
mutable_cf_options_.RefreshDerivedOptions(ioptions_);
ioptions_.db_paths.emplace_back("dummy",
std::numeric_limits<uint64_t>::max());
}
~CompactionPickerTest() {
}
void NewVersionStorage(int num_levels, CompactionStyle style) {
DeleteVersionStorage();
options_.num_levels = num_levels;
vstorage_.reset(new VersionStorageInfo(
&icmp_, ucmp_, options_.num_levels, style, nullptr));
}
void DeleteVersionStorage() {
vstorage_.reset();
files_.clear();
}
void Add(int level, uint32_t file_number, const char* smallest,
const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
SequenceNumber smallest_seq = 100,
SequenceNumber largest_seq = 100) {
assert(level < vstorage_->num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, path_id, file_size);
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue);
f->compensated_file_size = file_size;
f->refs = 0;
vstorage_->AddFile(level, f);
files_.emplace_back(f);
}
void UpdateVersionStorageInfo() {
vstorage_->UpdateFilesBySize();
vstorage_->UpdateNumNonEmptyLevels();
vstorage_->GenerateFileIndexer();
vstorage_->GenerateLevelFilesBrief();
vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_);
vstorage_->SetFinalized();
}
};
TEST(CompactionPickerTest, Empty) {
NewVersionStorage(6, kCompactionStyleLevel);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
ASSERT_TRUE(compaction.get() == nullptr);
}
TEST(CompactionPickerTest, Single) {
NewVersionStorage(6, kCompactionStyleLevel);
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
Add(0, 1U, "p", "q");
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
ASSERT_TRUE(compaction.get() == nullptr);
}
TEST(CompactionPickerTest, Level0Trigger) {
NewVersionStorage(6, kCompactionStyleLevel);
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
Add(0, 1U, "150", "200");
Add(0, 2U, "200", "250");
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
}
TEST(CompactionPickerTest, Level1Trigger) {
NewVersionStorage(6, kCompactionStyleLevel);
Add(1, 66U, "150", "200", 1000000000U);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
}
TEST(CompactionPickerTest, Level1Trigger2) {
NewVersionStorage(6, kCompactionStyleLevel);
Add(1, 66U, "150", "200", 1000000001U);
Add(1, 88U, "201", "300", 1000000000U);
Add(2, 6U, "150", "179", 1000000000U);
Add(2, 7U, "180", "220", 1000000000U);
Add(2, 8U, "221", "300", 1000000000U);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->num_input_files(1));
ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
}
TEST(CompactionPickerTest, LevelMaxScore) {
NewVersionStorage(6, kCompactionStyleLevel);
mutable_cf_options_.target_file_size_base = 10000000;
mutable_cf_options_.target_file_size_multiplier = 10;
Add(0, 1U, "150", "200", 1000000000U);
// Level 1 score 1.2
Add(1, 66U, "150", "200", 6000000U);
Add(1, 88U, "201", "300", 6000000U);
// Level 2 score 1.8. File 7 is the largest. Should be picked
Add(2, 6U, "150", "179", 60000000U);
Add(2, 7U, "180", "220", 60000001U);
Add(2, 8U, "221", "300", 60000000U);
// Level 3 score slightly larger than 1
Add(3, 26U, "150", "170", 260000000U);
Add(3, 27U, "171", "179", 260000000U);
Add(3, 28U, "191", "220", 260000000U);
Add(3, 29U, "221", "300", 260000000U);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
}
TEST(CompactionPickerTest, NeedsCompactionLevel) {
const int kLevels = 6;
const int kFileCount = 20;
for (int level = 0; level < kLevels - 1; ++level) {
uint64_t file_size =
mutable_cf_options_.MaxBytesForLevel(level) * 2 / kFileCount;
for (int file_count = 1; file_count <= kFileCount; ++file_count) {
// start a brand new version in each test.
NewVersionStorage(kLevels, kCompactionStyleLevel);
for (int i = 0; i < file_count; ++i) {
Add(level, i, ToString((i + 100) * 1000).c_str(),
ToString((i + 100) * 1000 + 999).c_str(),
file_size, 0, i * 100, i * 100 + 99);
}
UpdateVersionStorageInfo();
ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
vstorage_->CompactionScore(0) >= 1);
// release the version storage
DeleteVersionStorage();
}
}
}
TEST(CompactionPickerTest, NeedsCompactionUniversal) {
NewVersionStorage(1, kCompactionStyleUniversal);
UniversalCompactionPicker universal_compaction_picker(
ioptions_, &icmp_);
// must return false when there are no files.
ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
false);
// verify the trigger given different numbers of L0 files.
for (int i = 1;
i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
Add(0, i, ToString((i + 100) * 1000).c_str(),
ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
i * 100 + 99);
ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
vstorage_->CompactionScore(0) >= 1);
}
}
TEST(CompactionPickerTest, NeedsCompactionFIFO) {
NewVersionStorage(1, kCompactionStyleFIFO);
const int kFileCount =
mutable_cf_options_.level0_file_num_compaction_trigger * 3;
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * kFileCount / 2;
fifo_options_.max_table_files_size = kMaxSize;
ioptions_.compaction_options_fifo = fifo_options_;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
// must return false when there are no files.
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
// verify whether compaction is needed based on the current
// size of L0 files.
uint64_t current_size = 0;
for (int i = 1; i <= kFileCount; ++i) {
Add(0, i, ToString((i + 100) * 1000).c_str(),
ToString((i + 100) * 1000 + 999).c_str(),
kFileSize, 0, i * 100, i * 100 + 99);
current_size += kFileSize;
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
vstorage_->CompactionScore(0) >= 1);
}
}
} // namespace rocksdb
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

@ -0,0 +1,434 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <map>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "util/hash.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"
using std::unique_ptr;
namespace rocksdb {
namespace {
static const Comparator* comparator;
// A comparator for std::map that uses the static 'comparator' above
struct MapComparator {
bool operator()(const std::string& a, const std::string& b) const {
return comparator->Compare(a, b) < 0;
}
};
typedef std::map<std::string, std::string, MapComparator> KVMap;
class KVIter : public Iterator {
public:
explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {}
virtual bool Valid() const { return iter_ != map_->end(); }
virtual void SeekToFirst() { iter_ = map_->begin(); }
virtual void SeekToLast() {
if (map_->empty()) {
iter_ = map_->end();
} else {
iter_ = map_->find(map_->rbegin()->first);
}
}
virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); }
virtual void Next() { ++iter_; }
virtual void Prev() {
if (iter_ == map_->begin()) {
iter_ = map_->end();
return;
}
--iter_;
}
virtual Slice key() const { return iter_->first; }
virtual Slice value() const { return iter_->second; }
virtual Status status() const { return Status::OK(); }
private:
const KVMap* const map_;
KVMap::const_iterator iter_;
};
void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
ASSERT_EQ(iter1->Valid(), iter2->Valid());
if (iter1->Valid()) {
ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
}
}
// Measuring operations on a DB (expected to be empty).
// source_strings are candidate keys
void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
Random* rnd, int num_writes, int num_iter_ops,
int num_trigger_flush) {
KVMap map;
for (int i = 0; i < num_writes; i++) {
if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
db->Flush(FlushOptions());
}
int type = rnd->Uniform(2);
int index = rnd->Uniform(static_cast<int>(source_strings.size()));
auto& key = source_strings[index];
switch (type) {
case 0:
// put
map[key] = key;
ASSERT_OK(db->Put(WriteOptions(), key, key));
break;
case 1:
// delete
if (map.find(key) != map.end()) {
map.erase(key);
}
ASSERT_OK(db->Delete(WriteOptions(), key));
break;
default:
assert(false);
}
}
std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
std::unique_ptr<Iterator> result_iter(new KVIter(&map));
bool is_valid = false;
for (int i = 0; i < num_iter_ops; i++) {
// Random walk and make sure iter and result_iter return the
// same key and value
int type = rnd->Uniform(6);
ASSERT_OK(iter->status());
switch (type) {
case 0:
// Seek to First
iter->SeekToFirst();
result_iter->SeekToFirst();
break;
case 1:
// Seek to last
iter->SeekToLast();
result_iter->SeekToLast();
break;
case 2: {
// Seek to random key
auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
auto key = source_strings[key_idx];
iter->Seek(key);
result_iter->Seek(key);
break;
}
case 3:
// Next
if (is_valid) {
iter->Next();
result_iter->Next();
} else {
continue;
}
break;
case 4:
// Prev
if (is_valid) {
iter->Prev();
result_iter->Prev();
} else {
continue;
}
break;
default: {
assert(type == 5);
auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
auto key = source_strings[key_idx];
std::string result;
auto status = db->Get(ReadOptions(), key, &result);
if (map.find(key) == map.end()) {
ASSERT_TRUE(status.IsNotFound());
} else {
ASSERT_EQ(map[key], result);
}
break;
}
}
AssertItersEqual(iter.get(), result_iter.get());
is_valid = iter->Valid();
}
}
class DoubleComparator : public Comparator {
public:
DoubleComparator() {}
virtual const char* Name() const { return "DoubleComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
double da = std::stod(a.ToString());
double db = std::stod(b.ToString());
if (da == db) {
return a.compare(b);
} else if (da > db) {
return 1;
} else {
return -1;
}
}
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const {}
virtual void FindShortSuccessor(std::string* key) const {}
};
class HashComparator : public Comparator {
public:
HashComparator() {}
virtual const char* Name() const { return "HashComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
uint32_t ha = Hash(a.data(), a.size(), 66);
uint32_t hb = Hash(b.data(), b.size(), 66);
if (ha == hb) {
return a.compare(b);
} else if (ha > hb) {
return 1;
} else {
return -1;
}
}
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const {}
virtual void FindShortSuccessor(std::string* key) const {}
};
class TwoStrComparator : public Comparator {
public:
TwoStrComparator() {}
virtual const char* Name() const { return "TwoStrComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
assert(a.size() >= 2);
assert(b.size() >= 2);
size_t size_a1 = static_cast<size_t>(a[0]);
size_t size_b1 = static_cast<size_t>(b[0]);
size_t size_a2 = static_cast<size_t>(a[1]);
size_t size_b2 = static_cast<size_t>(b[1]);
assert(size_a1 + size_a2 + 2 == a.size());
assert(size_b1 + size_b2 + 2 == b.size());
Slice a1 = Slice(a.data() + 2, size_a1);
Slice b1 = Slice(b.data() + 2, size_b1);
Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
if (a1 != b1) {
return a1.compare(b1);
}
return a2.compare(b2);
}
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const {}
virtual void FindShortSuccessor(std::string* key) const {}
};
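// Illustrative helper (not part of this patch) showing the key layout that
// TwoStrComparator::Compare() asserts: one length byte per part, followed by
// the two raw parts. "MakeTwoStrKey" is a name invented here (parts must stay
// under 128 bytes for the char casts); the test below builds keys the same
// way by hand.
static std::string MakeTwoStrKey(const std::string& a, const std::string& b) {
  std::string key;
  key.push_back(static_cast<char>(a.size()));  // size of first part
  key.push_back(static_cast<char>(b.size()));  // size of second part
  key.append(a);
  key.append(b);
  return key;  // key.size() == a.size() + b.size() + 2, as Compare() assumes
}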
} // namespace
class ComparatorDBTest {
private:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
std::unique_ptr<const Comparator> comparator_guard;
public:
ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
comparator = BytewiseComparator();
dbname_ = test::TmpDir() + "/comparator_db_test";
ASSERT_OK(DestroyDB(dbname_, last_options_));
}
~ComparatorDBTest() {
delete db_;
ASSERT_OK(DestroyDB(dbname_, last_options_));
comparator = BytewiseComparator();
}
DB* GetDB() { return db_; }
void SetOwnedComparator(const Comparator* cmp) {
comparator_guard.reset(cmp);
comparator = cmp;
last_options_.comparator = cmp;
}
// Return the current option configuration.
Options* GetOptions() { return &last_options_; }
void DestroyAndReopen() {
// Destroy using last options
Destroy();
ASSERT_OK(TryReopen());
}
void Destroy() {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, last_options_));
}
Status TryReopen() {
delete db_;
db_ = nullptr;
last_options_.create_if_missing = true;
return DB::Open(last_options_, dbname_, &db_);
}
};
TEST(ComparatorDBTest, Bytewise) {
for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
DestroyAndReopen();
Random rnd(rand_seed);
DoRandomIteraratorTest(GetDB(),
{"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
8, 100, 3);
}
}
TEST(ComparatorDBTest, SimpleSuffixReverseComparator) {
SetOwnedComparator(new test::SimpleSuffixReverseComparator());
for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
Options* opt = GetOptions();
opt->comparator = comparator;
DestroyAndReopen();
Random rnd(rnd_seed);
std::vector<std::string> source_strings;
std::vector<std::string> source_prefixes;
// Randomly generate 5 prefixes
for (int i = 0; i < 5; i++) {
source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8));
}
for (int j = 0; j < 20; j++) {
int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
std::string key = source_prefixes[prefix_index] +
test::RandomHumanReadableString(&rnd, rnd.Uniform(8));
source_strings.push_back(key);
}
DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
}
}
TEST(ComparatorDBTest, Uint64Comparator) {
SetOwnedComparator(test::Uint64Comparator());
for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
Options* opt = GetOptions();
opt->comparator = comparator;
DestroyAndReopen();
Random rnd(rnd_seed);
Random64 rnd64(rnd_seed);
std::vector<std::string> source_strings;
// Randomly generate source keys
for (int i = 0; i < 100; i++) {
uint64_t r = rnd64.Next();
std::string str;
str.resize(8);
memcpy(&str[0], static_cast<void*>(&r), 8);
source_strings.push_back(str);
}
DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
}
}
TEST(ComparatorDBTest, DoubleComparator) {
SetOwnedComparator(new DoubleComparator());
for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
Options* opt = GetOptions();
opt->comparator = comparator;
DestroyAndReopen();
Random rnd(rnd_seed);
std::vector<std::string> source_strings;
// Randomly generate source keys
for (int i = 0; i < 100; i++) {
uint32_t r = rnd.Next();
uint32_t divide_order = rnd.Uniform(8);
double to_divide = 1.0;
for (uint32_t j = 0; j < divide_order; j++) {
to_divide *= 10.0;
}
source_strings.push_back(ToString(r / to_divide));
}
DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
}
}
TEST(ComparatorDBTest, HashComparator) {
SetOwnedComparator(new HashComparator());
for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
Options* opt = GetOptions();
opt->comparator = comparator;
DestroyAndReopen();
Random rnd(rnd_seed);
std::vector<std::string> source_strings;
// Randomly generate source keys
for (int i = 0; i < 100; i++) {
source_strings.push_back(test::RandomKey(&rnd, 8));
}
DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
}
}
TEST(ComparatorDBTest, TwoStrComparator) {
SetOwnedComparator(new TwoStrComparator());
for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
Options* opt = GetOptions();
opt->comparator = comparator;
DestroyAndReopen();
Random rnd(rnd_seed);
std::vector<std::string> source_strings;
// Randomly generate source keys
for (int i = 0; i < 100; i++) {
std::string str;
uint32_t size1 = rnd.Uniform(8);
uint32_t size2 = rnd.Uniform(8);
str.append(1, static_cast<char>(size1));
str.append(1, static_cast<char>(size2));
str.append(test::RandomKey(&rnd, size1));
str.append(test::RandomKey(&rnd, size2));
source_strings.push_back(str);
}
DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
}
}
} // namespace rocksdb
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

@ -115,8 +115,8 @@ class CorruptionTest {
continue;
}
missed += (key - next_expected);
next_expected = key + 1;
if (iter->value() != Value(key, &value_space)) {
next_expected = static_cast<unsigned int>(key + 1);
if (iter->value() != Value(static_cast<int>(key), &value_space)) {
bad_values++;
} else {
correct++;
@ -131,7 +131,7 @@ class CorruptionTest {
ASSERT_GE(max_expected, correct);
}
void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) {
void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) {
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
const char* msg = strerror(errno);
@ -143,14 +143,14 @@ class CorruptionTest {
if (-offset > sbuf.st_size) {
offset = 0;
} else {
offset = sbuf.st_size + offset;
offset = static_cast<int>(sbuf.st_size + offset);
}
}
if (offset > sbuf.st_size) {
offset = sbuf.st_size;
offset = static_cast<int>(sbuf.st_size);
}
if (offset + bytes_to_corrupt > sbuf.st_size) {
bytes_to_corrupt = sbuf.st_size - offset;
bytes_to_corrupt = static_cast<int>(sbuf.st_size - offset);
}
// Do it
@ -177,7 +177,7 @@ class CorruptionTest {
type == filetype &&
static_cast<int>(number) > picked_number) { // Pick latest file
fname = dbname_ + "/" + filenames[i];
picked_number = number;
picked_number = static_cast<int>(number);
}
}
ASSERT_TRUE(!fname.empty()) << filetype;
@ -231,7 +231,9 @@ TEST(CorruptionTest, Recovery) {
Check(100, 100);
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
Reopen();
ASSERT_TRUE(!TryReopen().ok());
options_.paranoid_checks = false;
Reopen(&options_);
// The 64 records in the first two log blocks are completely lost.
Check(36, 36);
@ -246,7 +248,8 @@ TEST(CorruptionTest, RecoverWriteError) {
TEST(CorruptionTest, NewFileErrorDuringWrite) {
// Do enough writing to force minor compaction
env_.writable_file_error_ = true;
const int num = 3 + (Options().write_buffer_size / kValueSize);
const int num =
static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
std::string value_storage;
Status s;
bool failed = false;
@ -332,6 +335,9 @@ TEST(CorruptionTest, CorruptedDescriptor) {
}
TEST(CorruptionTest, CompactionInputError) {
Options options;
options.max_background_flushes = 0;
Reopen(&options);
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
@ -351,6 +357,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) {
options.paranoid_checks = true;
options.write_buffer_size = 131072;
options.max_write_buffer_number = 2;
options.max_background_flushes = 0;
Reopen(&options);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

@ -92,7 +92,7 @@ class CuckooTableDBTest {
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
size_t last_non_zero_offset = 0;
for (int level = 0; level < db_->NumberLevels(); level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
@ -218,6 +218,7 @@ TEST(CuckooTableDBTest, Uint64Comparator) {
// Add more keys.
ASSERT_OK(Delete(Uint64Key(2))); // Delete.
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put(Uint64Key(3), "v0")); // Update.
ASSERT_OK(Put(Uint64Key(4), "v4"));
dbfull()->TEST_FlushMemTable();
@ -227,28 +228,25 @@ TEST(CuckooTableDBTest, Uint64Comparator) {
ASSERT_EQ("v4", Get(Uint64Key(4)));
}
TEST(CuckooTableDBTest, CompactionTrigger) {
TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) {
// Create a big L0 file and check it compacts into multiple files in L1.
Options options = CurrentOptions();
options.write_buffer_size = 100 << 10; // 100KB
options.level0_file_num_compaction_trigger = 2;
options.write_buffer_size = 270 << 10;
// Two SST files should be created, each containing 14 keys.
// Number of buckets will be 16. Total size ~156 KB.
options.target_file_size_base = 160 << 10;
Reopen(&options);
// Write 11 values, each 10016 B
for (int idx = 0; idx < 11; ++idx) {
// Write 28 values, each 10016 B ~ 10KB
for (int idx = 0; idx < 28; ++idx) {
ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
}
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ("1", FilesPerLevel());
// Generate one more file in level-0, and should trigger level-0 compaction
for (int idx = 11; idx < 22; ++idx) {
ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
}
dbfull()->TEST_WaitForFlushMemTable();
dbfull()->TEST_CompactRange(0, nullptr, nullptr);
ASSERT_EQ("0,2", FilesPerLevel());
for (int idx = 0; idx < 22; ++idx) {
for (int idx = 0; idx < 28; ++idx) {
ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
}
}

File diff suppressed because it is too large

@ -9,29 +9,35 @@
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <algorithm>
#include <string>
#include <stdint.h>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/job_context.h"
#include "db/version_set.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include "util/sync_point.h"
#include "util/file_util.h"
namespace rocksdb {
Status DBImpl::DisableFileDeletions() {
MutexLock l(&mutex_);
InstrumentedMutexLock l(&mutex_);
++disable_delete_obsolete_files_;
if (disable_delete_obsolete_files_ == 1) {
Log(options_.info_log, "File Deletions Disabled");
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"File Deletions Disabled");
} else {
Log(options_.info_log,
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
"File Deletions Disabled, but already disabled. Counter: %d",
disable_delete_obsolete_files_);
}
@ -39,10 +45,10 @@ Status DBImpl::DisableFileDeletions() {
}
Status DBImpl::EnableFileDeletions(bool force) {
DeletionState deletion_state;
JobContext job_context;
bool should_purge_files = false;
{
MutexLock l(&mutex_);
InstrumentedMutexLock l(&mutex_);
if (force) {
// if force, we need to enable file deletions right away
disable_delete_obsolete_files_ = 0;
@ -50,19 +56,21 @@ Status DBImpl::EnableFileDeletions(bool force) {
--disable_delete_obsolete_files_;
}
if (disable_delete_obsolete_files_ == 0) {
Log(options_.info_log, "File Deletions Enabled");
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"File Deletions Enabled");
should_purge_files = true;
FindObsoleteFiles(deletion_state, true);
FindObsoleteFiles(&job_context, true);
} else {
Log(options_.info_log,
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
"File Deletions Enable, but not really enabled. Counter: %d",
disable_delete_obsolete_files_);
}
}
if (should_purge_files) {
PurgeObsoleteFiles(deletion_state);
PurgeObsoleteFiles(job_context);
}
LogFlush(options_.info_log);
job_context.Clean();
LogFlush(db_options_.info_log);
return Status::OK();
}
@ -95,8 +103,8 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
if (!status.ok()) {
mutex_.Unlock();
Log(options_.info_log, "Cannot Flush data %s\n",
status.ToString().c_str());
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
"Cannot Flush data %s\n", status.ToString().c_str());
return status;
}
}
@ -117,65 +125,17 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
}
ret.push_back(CurrentFileName(""));
ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
ret.push_back(DescriptorFileName("", versions_->manifest_file_number()));
// find length of manifest file while holding the mutex lock
*manifest_file_size = versions_->ManifestFileSize();
*manifest_file_size = versions_->manifest_file_size();
mutex_.Unlock();
return Status::OK();
}
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
// First get sorted files in db dir, then get sorted files from archived
// dir, to avoid a race condition where a log file is moved to archived
// dir in between.
Status s;
// list wal files in main db dir.
VectorLogPtr logs;
s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
if (!s.ok()) {
return s;
}
// Reproduce the race condition where a log file is moved
// to archived dir, between these two sync points, used in
// (DBTest,TransactionLogIteratorRace)
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
files.clear();
// list wal files in archive dir.
std::string archivedir = ArchivalDirectory(options_.wal_dir);
if (env_->FileExists(archivedir)) {
s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
if (!s.ok()) {
return s;
}
}
uint64_t latest_archived_log_number = 0;
if (!files.empty()) {
latest_archived_log_number = files.back()->LogNumber();
Log(options_.info_log, "Latest Archived log: %" PRIu64,
latest_archived_log_number);
}
files.reserve(files.size() + logs.size());
for (auto& log : logs) {
if (log->LogNumber() > latest_archived_log_number) {
files.push_back(std::move(log));
} else {
// When the race condition happens, we could see the
// same log in both db dir and archived dir. Simply
// ignore the one in db dir. Note that, if we read
// archived dir first, we would have missed the log file.
Log(options_.info_log, "%s already moved to archive",
log->PathName().c_str());
}
}
return s;
return wal_manager_.GetSortedWalFiles(files);
}
}

File diff suppressed because it is too large

@ -12,7 +12,9 @@
#include <deque>
#include <limits>
#include <set>
#include <list>
#include <utility>
#include <list>
#include <vector>
#include <string>
@ -21,6 +23,8 @@
#include "db/snapshot.h"
#include "db/column_family.h"
#include "db/version_edit.h"
#include "db/wal_manager.h"
#include "db/writebuffer.h"
#include "memtable_list.h"
#include "port/port.h"
#include "rocksdb/db.h"
@ -30,7 +34,13 @@
#include "util/autovector.h"
#include "util/stop_watch.h"
#include "util/thread_local.h"
#include "util/scoped_arena_iterator.h"
#include "util/hash.h"
#include "util/instrumented_mutex.h"
#include "db/internal_stats.h"
#include "db/write_controller.h"
#include "db/flush_scheduler.h"
#include "db/write_thread.h"
namespace rocksdb {
@ -41,6 +51,7 @@ class VersionEdit;
class VersionSet;
class CompactionFilterV2;
class Arena;
struct JobContext;
class DBImpl : public DB {
public:
@ -108,6 +119,17 @@ class DBImpl : public DB {
bool reduce_level = false, int target_level = -1,
uint32_t target_path_id = 0);
using DB::CompactFiles;
virtual Status CompactFiles(
const CompactionOptions& compact_options,
ColumnFamilyHandle* column_family,
const std::vector<std::string>& input_file_names,
const int output_level, const int output_path_id = -1);
using DB::SetOptions;
Status SetOptions(ColumnFamilyHandle* column_family,
const std::unordered_map<std::string, std::string>& options_map);
using DB::NumberLevels;
virtual int NumberLevels(ColumnFamilyHandle* column_family);
using DB::MaxMemCompactionLevel;
@ -141,6 +163,15 @@ class DBImpl : public DB {
virtual Status DeleteFile(std::string name);
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
// Obtains the meta data of the specified column family of the DB.
// Status::NotFound() will be returned if the current DB does not have
// any column family matching the specified name.
// TODO(yhchiang): output parameter is placed in the end in this codebase.
virtual void GetColumnFamilyMetaData(
ColumnFamilyHandle* column_family,
ColumnFamilyMetaData* metadata) override;
#endif // ROCKSDB_LITE
// checks if all live files exist on file system and that their file sizes
@ -173,8 +204,8 @@ class DBImpl : public DB {
// Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
nullptr);
Iterator* TEST_NewInternalIterator(
Arena* arena, ColumnFamilyHandle* column_family = nullptr);
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
@ -184,132 +215,79 @@ class DBImpl : public DB {
// Return the current manifest file no.
uint64_t TEST_Current_Manifest_FileNo();
// Triggers a background call for testing.
void TEST_PurgeObsoleteteWAL();
// get total level0 file size. Only for testing.
uint64_t TEST_GetLevel0TotalSize();
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
{
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
}
void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
std::vector<std::vector<FileMetaData>>* metadata);
Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
SequenceNumber* sequence);
void TEST_LockMutex();
Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
#endif // NDEBUG
void TEST_UnlockMutex();
// Structure to store information for candidate files to delete.
struct CandidateFileInfo {
std::string file_name;
uint32_t path_id;
CandidateFileInfo(std::string name, uint32_t path)
: file_name(name), path_id(path) {}
bool operator==(const CandidateFileInfo& other) const {
return file_name == other.file_name && path_id == other.path_id;
}
};
// REQUIRES: mutex locked
void* TEST_BeginWrite();
// needed for CleanupIteratorState
struct DeletionState {
inline bool HaveSomethingToDelete() const {
return candidate_files.size() ||
sst_delete_files.size() ||
log_delete_files.size();
}
// REQUIRES: mutex locked
// pass the pointer that you got from TEST_BeginWrite()
void TEST_EndWrite(void* w);
// a list of all files that we'll consider deleting
// (every once in a while this is filled up with all files
// in the DB directory)
std::vector<CandidateFileInfo> candidate_files;
// the list of all live sst files that cannot be deleted
std::vector<FileDescriptor> sst_live;
// a list of sst files that we need to delete
std::vector<FileMetaData*> sst_delete_files;
// a list of log files that we need to delete
std::vector<uint64_t> log_delete_files;
// a list of memtables to be free
autovector<MemTable*> memtables_to_free;
autovector<SuperVersion*> superversions_to_free;
SuperVersion* new_superversion; // if nullptr no new superversion
// the current manifest_file_number, log_number and prev_log_number
// that correspond to the set of files in 'live'.
uint64_t manifest_file_number, pending_manifest_file_number, log_number,
prev_log_number;
explicit DeletionState(bool create_superversion = false) {
manifest_file_number = 0;
pending_manifest_file_number = 0;
log_number = 0;
prev_log_number = 0;
new_superversion = create_superversion ? new SuperVersion() : nullptr;
}
uint64_t TEST_max_total_in_memory_state() {
return max_total_in_memory_state_;
}
~DeletionState() {
// free pending memtables
for (auto m : memtables_to_free) {
delete m;
}
// free superversions
for (auto s : superversions_to_free) {
delete s;
}
// if new_superversion was not used, it will be non-nullptr and needs
// to be freed here
delete new_superversion;
}
};
#endif // ROCKSDB_LITE
// Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'candidate_files'.
// If force == false and the last call was less than
// options_.delete_obsolete_files_period_micros microseconds ago,
// it will not fill up the deletion_state
void FindObsoleteFiles(DeletionState& deletion_state,
bool force,
// db_options_.delete_obsolete_files_period_micros microseconds ago,
// it will not fill up the job_context
void FindObsoleteFiles(JobContext* job_context, bool force,
bool no_full_scan = false);
// Diffs the files listed in filenames against the live files; those that
// do not belong to live files are possibly removed. Also, removes all the
// files in sst_delete_files and log_delete_files.
// It is not necessary to hold the mutex when invoking this method.
void PurgeObsoleteFiles(DeletionState& deletion_state);
void PurgeObsoleteFiles(const JobContext& background_context);
ColumnFamilyHandle* DefaultColumnFamily() const;
const SnapshotList& snapshots() const { return snapshots_; }
protected:
Env* const env_;
const std::string dbname_;
unique_ptr<VersionSet> versions_;
const DBOptions options_;
const DBOptions db_options_;
Statistics* stats_;
Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
SuperVersion* super_version,
Arena* arena = nullptr);
SuperVersion* super_version, Arena* arena);
void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number,
const MutableCFOptions& mutable_cf_options);
void NotifyOnCompactionCompleted(ColumnFamilyData* cfd,
Compaction *c, const Status &st);
void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
void EraseThreadStatusDbInfo() const;
private:
friend class DB;
friend class InternalStats;
#ifndef ROCKSDB_LITE
friend class TailingIterator;
friend class ForwardIterator;
#endif
friend struct SuperVersion;
friend class CompactedDBImpl;
struct CompactionState;
struct Writer;
struct WriteContext;
Status NewDB();
@ -327,14 +305,34 @@ class DBImpl : public DB {
// Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles();
// Background process needs to call
// auto x = CaptureCurrentFileNumberInPendingOutputs()
// <do something>
// ReleaseFileNumberFromPendingOutputs(x)
// This will protect any temporary files created while <do something> is
// executing from being deleted.
// -----------
// This function will capture current file number and append it to
// pending_outputs_. This will prevent any background process from deleting
// any file created after this point.
std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
// This function should be called with the result of
// CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
// created between the calls CaptureCurrentFileNumberInPendingOutputs() and
// ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
// and blocked by any other pending_outputs_ calls)
void ReleaseFileNumberFromPendingOutputs(std::list<uint64_t>::iterator v);
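// Illustrative call pattern for the two methods above (sketch; the job code
// is elided and "WriteTempOutputs" is a hypothetical helper):
//
//   auto pending = CaptureCurrentFileNumberInPendingOutputs();
//   WriteTempOutputs();  // files numbered after the capture cannot be deleted
//   ReleaseFileNumberFromPendingOutputs(pending);
//
// Because file numbers grow monotonically, pending_outputs_ stays sorted and
// the front element serves as the deletion cutoff for FindObsoleteFiles().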
// Flush the in-memory write buffer to storage. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
DeletionState& deletion_state,
Status FlushMemTableToOutputFile(ColumnFamilyData* cfd,
const MutableCFOptions& mutable_cf_options,
bool* madeProgress, JobContext* job_context,
LogBuffer* log_buffer);
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
bool read_only);
// REQUIRES: log_numbers are sorted in ascending order
Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* max_sequence, bool read_only);
// The following two methods are used to flush a memtable to
// storage. The first one is used at database recovery time (when the
@ -343,47 +341,13 @@ class DBImpl : public DB {
// concurrent flush memtables to storage.
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
VersionEdit* edit);
Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
VersionEdit* edit, uint64_t* filenumber,
LogBuffer* log_buffer);
uint64_t SlowdownAmount(int n, double bottom, double top);
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
// thread should grab the mutex_ and be the first on writers queue.
// BeginWrite is used for it.
// Be aware! Writer's job can be done by other thread (see DBImpl::Write
// for examples), so check it via w.done before applying changes.
//
// Writer* w: writer to be placed in the queue
// uint64_t expiration_time: maximum time to be in the queue
// See also: EndWrite
Status BeginWrite(Writer* w, uint64_t expiration_time);
// After doing write job, we need to remove already used writers from
// writers_ queue and notify head of the queue about it.
// EndWrite is used for this.
//
// Writer* w: Writer, that was added by BeginWrite function
// Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
// does)
// we should pass last_writer as a parameter to
// EndWrite
// (if you don't touch other writers, just pass w)
// Status status: Status of write operation
// See also: BeginWrite
void EndWrite(Writer* w, Writer* last_writer, Status status);
Status MakeRoomForWrite(ColumnFamilyData* cfd,
WriteContext* context,
uint64_t expiration_time);
Status DelayWrite(uint64_t expiration_time);
Status ScheduleFlushes(WriteContext* context);
Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
WriteContext* context);
void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group);
// Force current memtable contents to be flushed.
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
@ -393,98 +357,55 @@ class DBImpl : public DB {
void RecordFlushIOStats();
void RecordCompactionIOStats();
#ifndef ROCKSDB_LITE
Status CompactFilesImpl(
const CompactionOptions& compact_options, ColumnFamilyData* cfd,
Version* version, const std::vector<std::string>& input_file_names,
const int output_level, int output_path_id);
#endif // ROCKSDB_LITE
ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
void MaybeScheduleFlushOrCompaction();
void SchedulePendingFlush(ColumnFamilyData* cfd);
void SchedulePendingCompaction(ColumnFamilyData* cfd);
static void BGWorkCompaction(void* db);
static void BGWorkFlush(void* db);
void BackgroundCallCompaction();
void BackgroundCallFlush();
Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
LogBuffer* log_buffer);
Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
LogBuffer* log_buffer);
void CleanupCompaction(CompactionState* compact, Status status);
Status DoCompactionWork(CompactionState* compact,
DeletionState& deletion_state,
LogBuffer* log_buffer);
// This function is called as part of compaction. It enables the flush
// process to preempt compaction, since flush has higher priority.
// Returns: micros spent executing
uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
DeletionState& deletion_state,
const MutableCFOptions& mutable_cf_options,
JobContext* job_context,
LogBuffer* log_buffer);
// Call compaction filter if is_compaction_v2 is not true. Then iterate
// through input and compact the kv-pairs
Status ProcessKeyValueCompaction(
bool is_snapshot_supported,
SequenceNumber visible_at_tip,
SequenceNumber earliest_snapshot,
SequenceNumber latest_snapshot,
DeletionState& deletion_state,
bool bottommost_level,
int64_t& imm_micros,
Iterator* input,
CompactionState* compact,
bool is_compaction_v2,
LogBuffer* log_buffer);
// Call compaction_filter_v2->Filter() on kv-pairs in compact
void CallCompactionFilterV2(CompactionState* compact,
CompactionFilterV2* compaction_filter_v2);
Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact,
LogBuffer* log_buffer);
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
#ifdef ROCKSDB_LITE
void PurgeObsoleteWALFiles() {
// this function is used for archiving WAL files. we don't need this in
// ROCKSDB_LITE
}
#else
void PurgeObsoleteWALFiles();
Status GetSortedWalsOfType(const std::string& path,
VectorLogPtr& log_files,
WalFileType type);
// Requires: all_logs should be sorted with earliest log file first
// Retains all log files in all_logs which contain updates with seq no.
// greater than or equal to the requested SequenceNumber.
Status RetainProbableWalFiles(VectorLogPtr& all_logs,
const SequenceNumber target);
Status ReadFirstRecord(const WalFileType type, const uint64_t number,
SequenceNumber* sequence);
Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
#endif // ROCKSDB_LITE
void PrintStatistics();
// dump rocksdb.stats to LOG
void MaybeDumpStats();
// Return true if the current db supports snapshot. If the current
// DB does not support snapshot, then calling GetSnapshot() will always
// return nullptr.
//
// @see GetSnapshot()
virtual bool IsSnapshotSupported() const;
// Return the minimum empty level that could hold the total data in the
// input level. Return the input level if no such level can be found.
int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
const MutableCFOptions& mutable_cf_options, int level);
// Move the files in the input level to the target level.
// If target_level < 0, automatically calculate the minimum level that could
// hold the data set.
Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
// helper functions for adding and removing from flush & compaction queues
void AddToCompactionQueue(ColumnFamilyData* cfd);
ColumnFamilyData* PopFirstFromCompactionQueue();
void AddToFlushQueue(ColumnFamilyData* cfd);
ColumnFamilyData* PopFirstFromFlushQueue();
// table_cache_ provides its own synchronization
std::shared_ptr<Cache> table_cache_;
@ -492,8 +413,8 @@ class DBImpl : public DB {
FileLock* db_lock_;
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
InstrumentedMutex mutex_;
std::atomic<bool> shutting_down_;
// This condition variable is signaled on these conditions:
// * whenever bg_compaction_scheduled_ goes down to 0
// * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't
@ -502,9 +423,10 @@ class DBImpl : public DB {
// * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is
// done, even if it didn't make any progress)
// * whenever there is an error in background flush or compaction
port::CondVar bg_cv_;
InstrumentedCondVar bg_cv_;
uint64_t logfile_number_;
unique_ptr<log::Writer> log_;
bool log_dir_synced_;
bool log_empty_;
ColumnFamilyHandleImpl* default_cf_handle_;
InternalStats* default_cf_internal_stats_;
@ -526,26 +448,85 @@ class DBImpl : public DB {
// some code-paths
bool single_column_family_mode_;
std::unique_ptr<Directory> db_directory_;
bool is_snapshot_supported_;
// Queue of writers.
std::deque<Writer*> writers_;
WriteBatch tmp_batch_;
// Class to maintain directories for all database paths other than main one.
class Directories {
public:
Status SetDirectories(Env* env, const std::string& dbname,
const std::string& wal_dir,
const std::vector<DbPath>& data_paths);
SnapshotList snapshots_;
Directory* GetDataDir(size_t path_id);
Directory* GetWalDir() {
if (wal_dir_) {
return wal_dir_.get();
}
return db_dir_.get();
}
Directory* GetDbDir() { return db_dir_.get(); }
private:
std::unique_ptr<Directory> db_dir_;
std::vector<std::unique_ptr<Directory>> data_dirs_;
std::unique_ptr<Directory> wal_dir_;
// cache for ReadFirstRecord() calls
std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
port::Mutex read_first_record_cache_mutex_;
Status CreateAndNewDirectory(Env* env, const std::string& dirname,
std::unique_ptr<Directory>* directory) const;
};
Directories directories_;
WriteBuffer write_buffer_;
// Set of table files to protect from deletion because they are
// part of ongoing compactions.
// map from pending file number ID to their path IDs.
FileNumToPathIdMap pending_outputs_;
WriteThread write_thread_;
WriteBatch tmp_batch_;
// At least one compaction or flush job is pending but not yet scheduled
// because of the max background thread limit.
bool bg_schedule_needed_;
WriteController write_controller_;
FlushScheduler flush_scheduler_;
SnapshotList snapshots_;
// For each background job, pending_outputs_ keeps the current file number at
// the time that background job started.
// FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
// number bigger than any of the file number in pending_outputs_. Since file
// numbers grow monotonically, this also means that pending_outputs_ is always
// sorted. After a background job is done executing, its file number is
// deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
// it up.
// State is protected with db mutex.
std::list<uint64_t> pending_outputs_;
// flush_queue_ and compaction_queue_ hold column families that we need to
// flush and compact, respectively.
// A column family is inserted into flush_queue_ when it satisfies condition
// cfd->imm()->IsFlushPending()
// A column family is inserted into compaction_queue_ when it satisfied
// condition cfd->NeedsCompaction()
// Column families in this list are all Ref()-erenced
// TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
// do RAII on ColumnFamilyData
// Column families are in this queue when they need to be flushed or
// compacted. Consumers of these queues are flush and compaction threads. When
// column family is put on this queue, we increase unscheduled_flushes_ and
// unscheduled_compactions_. When these variables are bigger than zero, that
// means we need to schedule background threads for flush and compaction.
// Once the background threads are scheduled, we decrease unscheduled_flushes_
// and unscheduled_compactions_. That way we keep track of the number of
// compaction and flush threads we need to schedule. This scheduling is done
// in MaybeScheduleFlushOrCompaction()
// invariant(column family present in flush_queue_ <==>
// ColumnFamilyData::pending_flush_ == true)
std::deque<ColumnFamilyData*> flush_queue_;
// invariant(column family present in compaction_queue_ <==>
// ColumnFamilyData::pending_compaction_ == true)
std::deque<ColumnFamilyData*> compaction_queue_;
int unscheduled_flushes_;
int unscheduled_compactions_;
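// Illustrative bookkeeping for the two counters above (a simplified sketch
// of the scheduling loop described in the comment; the flush limit shown is
// a placeholder):
//
//   while (unscheduled_flushes_ > 0 &&
//          bg_flush_scheduled_ < db_options_.max_background_flushes) {
//     unscheduled_flushes_--;
//     bg_flush_scheduled_++;
//     env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
//   }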
// count how many background compactions are running or have been scheduled
int bg_compaction_scheduled_;
@ -584,30 +565,23 @@ class DBImpl : public DB {
// without any synchronization
int disable_delete_obsolete_files_;
// last time when DeleteObsoleteFiles was invoked
uint64_t delete_obsolete_files_last_run_;
// last time when PurgeObsoleteWALFiles ran.
uint64_t purge_wal_files_last_run_;
// next time when we should run DeleteObsoleteFiles with full scan
uint64_t delete_obsolete_files_next_run_;
// last time stats were dumped to LOG
std::atomic<uint64_t> last_stats_dump_time_microsec_;
// obsolete files will be deleted every this seconds if ttl deletion is
// enabled and archive size_limit is disabled.
uint64_t default_interval_to_delete_obsolete_WAL_;
bool flush_on_destroy_; // Used when disableWAL is true.
static const int KEEP_LOG_FILE_NUM = 1000;
static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
std::string db_absolute_path_;
// count of the number of contiguous delaying writes
int delayed_writes_;
// The options to access storage files
const EnvOptions storage_options_;
const EnvOptions env_options_;
#ifndef ROCKSDB_LITE
WalManager wal_manager_;
#endif // ROCKSDB_LITE
// A value of true temporarily disables scheduling of background work
bool bg_work_gate_closed_;
@ -618,13 +592,16 @@ class DBImpl : public DB {
// Indicate DB was opened successfully
bool opened_successfully_;
// The list of registered event listeners.
std::list<EventListener*> listeners_;
// count how many events are currently being notified.
int notifying_events_;
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
// dump the delayed_writes_ to the log file and reset counter.
void DelayLoggingAndReset();
// Return the earliest snapshot where seqno is visible.
// Store the snapshot right before that, if any, in prev_snapshot
inline SequenceNumber findEarliestVisibleSnapshot(
@ -633,10 +610,24 @@ class DBImpl : public DB {
SequenceNumber* prev_snapshot);
// Background threads call this function, which is just a wrapper around
// the cfd->InstallSuperVersion() function. Background threads carry
// deletion_state which can have new_superversion already allocated.
void InstallSuperVersion(ColumnFamilyData* cfd,
DeletionState& deletion_state);
// the InstallSuperVersion() function. Background threads carry
// job_context which can have new_superversion already
// allocated.
void InstallSuperVersionBackground(
ColumnFamilyData* cfd, JobContext* job_context,
const MutableCFOptions& mutable_cf_options);
// All ColumnFamily state changes go through this function. Here we analyze
// the new state and schedule background work if we detect that the new
// state needs a flush or a compaction.
// If dont_schedule_bg_work == true, the caller asks us not to schedule the
// flush or compaction here, and promises to schedule the needed background
// work itself. We use this to avoid scheduling background compactions while
// we are in the write thread, which is very performance critical. The
// caller schedules the background work as soon as it exits the write thread.
SuperVersion* InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_sv,
const MutableCFOptions& mutable_cf_options,
bool dont_schedule_bg_work = false);
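The dont_schedule_bg_work contract above can be summarized by this hypothetical call sequence (illustrative only; the exact call sites are in db_impl.cc):

// Inside the write thread, with the db mutex held, avoid scheduling here:
//   InstallSuperVersion(cfd, new_sv, mutable_cf_options,
//                       true /* dont_schedule_bg_work */);
// Immediately after exiting the write thread, keep the promise:
//   MaybeScheduleFlushOrCompaction();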
// Find the SuperVersion and reference it. Based on options, it might
// return the thread-local cached one.
@ -680,8 +671,4 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
}
// Dump db file summary, implemented in util/
extern void DumpDBFileSummary(const DBOptions& options,
const std::string& dbname);
} // namespace rocksdb

@ -10,17 +10,17 @@
#ifndef ROCKSDB_LITE
#include "db/db_impl.h"
#include "util/thread_status_updater.h"
namespace rocksdb {
void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); }
uint64_t DBImpl::TEST_GetLevel0TotalSize() {
MutexLock l(&mutex_);
return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
InstrumentedMutexLock l(&mutex_);
return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
}
Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena,
ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd;
if (column_family == nullptr) {
cfd = default_cf_handle_->cfd();
@ -33,7 +33,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
mutex_.Unlock();
ReadOptions roptions;
return NewInternalIterator(roptions, cfd, super_version);
return NewInternalIterator(roptions, cfd, super_version, arena);
}
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
@ -45,8 +45,8 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
cfd = cfh->cfd();
}
MutexLock l(&mutex_);
return cfd->current()->MaxNextLevelOverlappingBytes();
InstrumentedMutexLock l(&mutex_);
return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
}
void DBImpl::TEST_GetFilesMetaData(
@ -54,10 +54,11 @@ void DBImpl::TEST_GetFilesMetaData(
std::vector<std::vector<FileMetaData>>* metadata) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
MutexLock l(&mutex_);
InstrumentedMutexLock l(&mutex_);
metadata->resize(NumberLevels());
for (int level = 0; level < NumberLevels(); level++) {
const std::vector<FileMetaData*>& files = cfd->current()->files_[level];
const std::vector<FileMetaData*>& files =
cfd->current()->storage_info()->LevelFiles(level);
(*metadata)[level].clear();
for (const auto& f : files) {
@ -67,7 +68,7 @@ void DBImpl::TEST_GetFilesMetaData(
}
uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
return versions_->ManifestFileNumber();
return versions_->manifest_file_number();
}
Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
@ -81,8 +82,8 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
cfd = cfh->cfd();
}
int output_level =
(cfd->options()->compaction_style == kCompactionStyleUniversal ||
cfd->options()->compaction_style == kCompactionStyleFIFO)
(cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
? level
: level + 1;
return RunManualCompaction(cfd, level, output_level, 0, begin, end);
@ -112,22 +113,33 @@ Status DBImpl::TEST_WaitForCompact() {
// Wait for compaction. It actually waits for a scheduled compaction
// OR flush to finish.
MutexLock l(&mutex_);
InstrumentedMutexLock l(&mutex_);
while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) {
bg_cv_.Wait();
}
return bg_error_;
}
Status DBImpl::TEST_ReadFirstRecord(const WalFileType type,
const uint64_t number,
SequenceNumber* sequence) {
return ReadFirstRecord(type, number, sequence);
void DBImpl::TEST_LockMutex() {
mutex_.Lock();
}
void DBImpl::TEST_UnlockMutex() {
mutex_.Unlock();
}
void* DBImpl::TEST_BeginWrite() {
auto w = new WriteThread::Writer(&mutex_);
Status s = write_thread_.EnterWriteThread(w, 0);
assert(s.ok() && !w->done); // No timeout and nobody should do our job
return reinterpret_cast<void*>(w);
}
Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
SequenceNumber* sequence) {
return ReadFirstLine(fname, sequence);
void DBImpl::TEST_EndWrite(void* w) {
auto writer = reinterpret_cast<WriteThread::Writer*>(w);
write_thread_.ExitWriteThread(writer, writer, Status::OK());
delete writer;
}
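A test would pair these hooks as follows (a sketch; dbi is assumed to be a DBImpl* for an open database, and this is illustrative, not an actual test):

void* w = dbi->TEST_BeginWrite();   // enter the write thread, block writers
// ... inspect or mutate state that must not race with writes ...
dbi->TEST_EndWrite(w);              // exit the write thread and clean up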
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -2,56 +2,31 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "db/db_impl_readonly.h"
#include "utilities/compacted_db/compacted_db_impl.h"
#include "db/db_impl.h"
#include <algorithm>
#include <set>
#include <string>
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/merge_context.h"
#include "db/table_cache.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "rocksdb/merge_operator.h"
#include "port/port.h"
#include "table/block.h"
#include "table/merger.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/build_version.h"
#include "db/db_iter.h"
#include "util/perf_context_imp.h"
namespace rocksdb {
DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
#ifndef ROCKSDB_LITE
DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
const std::string& dbname)
: DBImpl(options, dbname) {
Log(options_.info_log, "Opening the db in read only mode");
: DBImpl(db_options, dbname) {
Log(INFO_LEVEL, db_options_.info_log, "Opening the db in read only mode");
LogFlush(db_options_.info_log);
}
DBImplReadOnly::~DBImplReadOnly() {
}
// Implementations of the DB interface
Status DBImplReadOnly::Get(const ReadOptions& options,
Status DBImplReadOnly::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) {
Status s;
@ -61,33 +36,35 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
SuperVersion* super_version = cfd->GetSuperVersion();
MergeContext merge_context;
LookupKey lkey(key, snapshot);
if (super_version->mem->Get(lkey, value, &s, merge_context,
*cfd->options())) {
if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
} else {
super_version->current->Get(options, lkey, value, &s, &merge_context);
PERF_TIMER_GUARD(get_from_output_files_time);
super_version->current->Get(read_options, lkey, value, &s, &merge_context);
}
return s;
}
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
SequenceNumber latest_snapshot = versions_->LastSequence();
auto db_iter = NewArenaWrappedDbIterator(
env_, *cfd->options(), cfd->user_comparator(),
(options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot));
auto internal_iter =
NewInternalIterator(options, cfd, super_version, db_iter->GetArena());
env_, *cfd->ioptions(), cfd->user_comparator(),
(read_options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(
read_options.snapshot)->number_
: latest_snapshot),
super_version->mutable_cf_options.max_sequential_skip_in_iterations);
auto internal_iter = NewInternalIterator(
read_options, cfd, super_version, db_iter->GetArena());
db_iter->SetIterUnderDBIter(internal_iter);
return db_iter;
}
Status DBImplReadOnly::NewIterators(
const ReadOptions& options,
const ReadOptions& read_options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) {
if (iterators == nullptr) {
@ -98,14 +75,17 @@ Status DBImplReadOnly::NewIterators(
SequenceNumber latest_snapshot = versions_->LastSequence();
for (auto cfh : column_families) {
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
auto db_iter = NewArenaWrappedDbIterator(
env_, *cfd->options(), cfd->user_comparator(),
options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot);
auto internal_iter = NewInternalIterator(
options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena());
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
auto* sv = cfd->GetSuperVersion()->Ref();
auto* db_iter = NewArenaWrappedDbIterator(
env_, *cfd->ioptions(), cfd->user_comparator(),
(read_options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(
read_options.snapshot)->number_
: latest_snapshot),
sv->mutable_cf_options.max_sequential_skip_in_iterations);
auto* internal_iter = NewInternalIterator(
read_options, cfd, sv, db_iter->GetArena());
db_iter->SetIterUnderDBIter(internal_iter);
iterators->push_back(db_iter);
}
@ -117,6 +97,13 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DB** dbptr, bool error_if_log_file_exist) {
*dbptr = nullptr;
// Try to first open DB as fully compacted DB
Status s;
s = CompactedDBImpl::Open(options, dbname, dbptr);
if (s.ok()) {
return s;
}
DBOptions db_options(options);
ColumnFamilyOptions cf_options(options);
std::vector<ColumnFamilyDescriptor> column_families;
@ -124,8 +111,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
std::vector<ColumnFamilyHandle*> handles;
Status s =
DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
if (s.ok()) {
assert(handles.size() == 1);
// I can delete the handle since DBImpl is always holding a
@ -167,6 +153,10 @@ Status DB::OpenForReadOnly(
impl->mutex_.Unlock();
if (s.ok()) {
*dbptr = impl;
for (auto* h : *handles) {
impl->NewThreadStatusCfInfo(
reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
}
} else {
for (auto h : *handles) {
delete h;
@ -177,5 +167,20 @@ Status DB::OpenForReadOnly(
return s;
}
#else // !ROCKSDB_LITE
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DB** dbptr, bool error_if_log_file_exist) {
return Status::NotSupported("Not supported in ROCKSDB_LITE.");
}
Status DB::OpenForReadOnly(
const DBOptions& db_options, const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
bool error_if_log_file_exist) {
return Status::NotSupported("Not supported in ROCKSDB_LITE.");
}
#endif // !ROCKSDB_LITE
} // namespace rocksdb
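Putting the read-only open path together, a caller sees something like this (a sketch; the path and key are hypothetical):

#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  rocksdb::Status s =
      rocksdb::DB::OpenForReadOnly(options, "/tmp/testdb", &db);
  if (s.ok()) {
    std::string value;
    s = db->Get(rocksdb::ReadOptions(), "key", &value);  // reads are allowed
    // Writes on this handle return Status::NotSupported(...), per the
    // overrides in db_impl_readonly.h.
    delete db;
  }
  return 0;
}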

@ -2,24 +2,14 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#pragma once
#include "db/db_impl.h"
#include <deque>
#include <set>
#ifndef ROCKSDB_LITE
#include "db/db_impl.h"
#include <vector>
#include <string>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
namespace rocksdb {
@ -75,10 +65,20 @@ class DBImplReadOnly : public DBImpl {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::CompactFiles;
virtual Status CompactFiles(
const CompactionOptions& compact_options,
ColumnFamilyHandle* column_family,
const std::vector<std::string>& input_file_names,
const int output_level, const int output_path_id = -1) override {
return Status::NotSupported("Not supported operation in read only mode.");
}
#ifndef ROCKSDB_LITE
virtual Status DisableFileDeletions() override {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status EnableFileDeletions(bool force) override {
return Status::NotSupported("Not supported operation in read only mode.");
}
@ -103,3 +103,5 @@ class DBImplReadOnly : public DBImpl {
void operator=(const DBImplReadOnly&);
};
}
#endif // !ROCKSDB_LITE

@ -58,22 +58,25 @@ class DBIter: public Iterator {
kReverse
};
DBIter(Env* env, const Options& options, const Comparator* cmp,
Iterator* iter, SequenceNumber s, bool arena_mode)
DBIter(Env* env, const ImmutableCFOptions& ioptions,
const Comparator* cmp, Iterator* iter, SequenceNumber s,
bool arena_mode, uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr)
: arena_mode_(arena_mode),
env_(env),
logger_(options.info_log.get()),
logger_(ioptions.info_log),
user_comparator_(cmp),
user_merge_operator_(options.merge_operator.get()),
user_merge_operator_(ioptions.merge_operator),
iter_(iter),
sequence_(s),
direction_(kForward),
valid_(false),
current_entry_is_merged_(false),
statistics_(options.statistics.get()) {
statistics_(ioptions.statistics),
iterate_upper_bound_(iterate_upper_bound) {
RecordTick(statistics_, NO_ITERATORS);
has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr);
max_skip_ = options.max_sequential_skip_in_iterations;
prefix_extractor_ = ioptions.prefix_extractor;
max_skip_ = max_sequential_skip_in_iterations;
}
virtual ~DBIter() {
RecordTick(statistics_, NO_ITERATORS, -1);
@ -132,7 +135,7 @@ class DBIter: public Iterator {
}
}
bool has_prefix_extractor_;
const SliceTransform* prefix_extractor_;
bool arena_mode_;
Env* const env_;
Logger* logger_;
@ -149,6 +152,7 @@ class DBIter: public Iterator {
bool current_entry_is_merged_;
Statistics* statistics_;
uint64_t max_skip_;
const Slice* iterate_upper_bound_;
// No copying allowed
DBIter(const DBIter&);
@ -158,7 +162,8 @@ class DBIter: public Iterator {
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
if (!ParseInternalKey(iter_->key(), ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
Log(logger_, "corrupted internal key in DBIter: %s",
Log(InfoLogLevel::ERROR_LEVEL,
logger_, "corrupted internal key in DBIter: %s",
iter_->key().ToString(true).c_str());
return false;
} else {
@ -194,9 +199,8 @@ void DBIter::Next() {
// NOTE: In between, saved_key_ can point to a user key that has
// a delete marker
inline void DBIter::FindNextUserEntry(bool skipping) {
PERF_TIMER_AUTO(find_next_user_entry_time);
PERF_TIMER_GUARD(find_next_user_entry_time);
FindNextUserEntryInternal(skipping);
PERF_TIMER_STOP(find_next_user_entry_time);
}
// Actual implementation of DBIter::FindNextUserEntry()
@ -208,36 +212,44 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
uint64_t num_skipped = 0;
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else {
skipping = false;
switch (ikey.type) {
case kTypeDeletion:
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
saved_key_.SetKey(ikey.user_key);
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break;
case kTypeValue:
valid_ = true;
saved_key_.SetKey(ikey.user_key);
return;
case kTypeMerge:
// By now, we are sure the current ikey is going to yield a value
saved_key_.SetKey(ikey.user_key);
current_entry_is_merged_ = true;
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
return;
default:
assert(false);
break;
if (ParseKey(&ikey)) {
if (iterate_upper_bound_ != nullptr &&
ikey.user_key.compare(*iterate_upper_bound_) >= 0) {
break;
}
if (ikey.sequence <= sequence_) {
if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else {
skipping = false;
switch (ikey.type) {
case kTypeDeletion:
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
saved_key_.SetKey(ikey.user_key);
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break;
case kTypeValue:
valid_ = true;
saved_key_.SetKey(ikey.user_key);
return;
case kTypeMerge:
// By now, we are sure the current ikey is going to yield a value
saved_key_.SetKey(ikey.user_key);
current_entry_is_merged_ = true;
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
return;
default:
assert(false);
break;
}
}
}
}
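The new upper-bound check above is what makes ReadOptions::iterate_upper_bound take effect; a sketch of the user-facing behavior (assumes an open rocksdb::DB* db; the keys are hypothetical):

rocksdb::ReadOptions ro;
rocksdb::Slice upper("k3");
ro.iterate_upper_bound = &upper;  // entries with user key >= "k3" are cut off
std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
for (it->Seek("k1"); it->Valid(); it->Next()) {
  // only keys in ["k1", "k3") are visited
}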
@ -267,16 +279,17 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
// iter_ points to the next entry (or invalid)
void DBIter::MergeValuesNewToOld() {
if (!user_merge_operator_) {
Log(logger_, "Options::merge_operator is null.");
throw std::logic_error("DBIter::MergeValuesNewToOld() with"
" Options::merge_operator null");
Log(InfoLogLevel::ERROR_LEVEL,
logger_, "Options::merge_operator is null.");
status_ = Status::InvalidArgument("user_merge_operator_ must be set.");
valid_ = false;
return;
}
// Start the merge process by pushing the first operand
std::deque<std::string> operands;
operands.push_front(iter_->value().ToString());
std::string merge_result; // Temporary string to hold merge result later
ParsedInternalKey ikey;
for (iter_->Next(); iter_->Valid(); iter_->Next()) {
if (!ParseKey(&ikey)) {
@ -300,8 +313,8 @@ void DBIter::MergeValuesNewToOld() {
// hit a put, merge the put value with operands and store the
// final result in saved_value_. We are done!
// ignore corruption if there is any.
const Slice value = iter_->value();
user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
const Slice val = iter_->value();
user_merge_operator_->FullMerge(ikey.user_key, &val, operands,
&saved_value_, logger_);
// iter_ is positioned after put
iter_->Next();
@ -311,8 +324,8 @@ void DBIter::MergeValuesNewToOld() {
if (kTypeMerge == ikey.type) {
// hit a merge, add the value as an operand and run associative merge.
// when complete, add result to operands and continue.
const Slice& value = iter_->value();
operands.push_front(value.ToString());
const Slice& val = iter_->value();
operands.push_front(val.ToString());
}
}
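For orientation, here is a self-contained model of what the FullMerge call above computes for a simple append-style operator. MergeValuesNewToOld() pushes operands with push_front while walking from newest to oldest, so the deque ends up ordered oldest first (ModelFullMerge is illustrative, not RocksDB's API):

#include <deque>
#include <string>

std::string ModelFullMerge(const std::string* put_value,
                           const std::deque<std::string>& operands) {
  // Start from the base value (if a put was found), then apply operands
  // front-to-back, i.e. oldest first.
  std::string result = put_value ? *put_value : "";
  for (const auto& op : operands) {
    result += ",";
    result += op;
  }
  return result;
}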
@ -399,6 +412,7 @@ bool DBIter::FindValueForCurrentKey() {
case kTypeDeletion:
operands.clear();
last_not_merge_type = kTypeDeletion;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break;
case kTypeMerge:
assert(user_merge_operator_ != nullptr);
@ -408,6 +422,7 @@ bool DBIter::FindValueForCurrentKey() {
assert(false);
}
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0);
iter_->Prev();
++num_skipped;
@ -491,8 +506,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
return true;
}
const Slice& value = iter_->value();
user_merge_operator_->FullMerge(saved_key_.GetKey(), &value, operands,
const Slice& val = iter_->value();
user_merge_operator_->FullMerge(saved_key_.GetKey(), &val, operands,
&saved_value_, logger_);
valid_ = true;
return true;
@ -554,12 +569,29 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
void DBIter::Seek(const Slice& target) {
StopWatch sw(env_, statistics_, DB_SEEK);
// Total ordering is not guaranteed if prefix_extractor is set;
// hence prefix-based seeks will not give correct results.
if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) {
if (!prefix_extractor_->InDomain(*iterate_upper_bound_) ||
!prefix_extractor_->InDomain(target) ||
prefix_extractor_->Transform(*iterate_upper_bound_).compare(
prefix_extractor_->Transform(target)) != 0) {
status_ = Status::InvalidArgument("read_options.iterate_*_bound "
" and seek target need to have the same prefix.");
valid_ = false;
return;
}
}
saved_key_.Clear();
// now saved_key_ is used to store the internal key.
saved_key_.SetInternalKey(target, sequence_);
PERF_TIMER_AUTO(seek_internal_seek_time);
iter_->Seek(saved_key_.GetKey());
PERF_TIMER_STOP(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->Seek(saved_key_.GetKey());
}
if (iter_->Valid()) {
direction_ = kForward;
ClearSavedValue();
@ -572,14 +604,17 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() {
// Don't use iter_->Seek() if we set a prefix extractor
// because prefix seek will be used.
if (has_prefix_extractor_) {
if (prefix_extractor_ != nullptr) {
max_skip_ = std::numeric_limits<uint64_t>::max();
}
direction_ = kForward;
ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
iter_->SeekToFirst();
PERF_TIMER_STOP(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->SeekToFirst();
}
if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */);
} else {
@ -590,24 +625,29 @@ void DBIter::SeekToFirst() {
void DBIter::SeekToLast() {
// Don't use iter_->Seek() if we set a prefix extractor
// because prefix seek will be used.
if (has_prefix_extractor_) {
if (prefix_extractor_ != nullptr) {
max_skip_ = std::numeric_limits<uint64_t>::max();
}
direction_ = kReverse;
ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
iter_->SeekToLast();
PERF_TIMER_STOP(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->SeekToLast();
}
PrevInternal();
}
Iterator* NewDBIterator(Env* env, const Options& options,
Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence) {
return new DBIter(env, options, user_key_comparator, internal_iter, sequence,
false);
const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound) {
return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
false, max_sequential_skip_in_iterations,
iterate_upper_bound);
}
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
@ -635,14 +675,20 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
}
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator,
const SequenceNumber& sequence) {
Env* env, const ImmutableCFOptions& ioptions,
const Comparator* user_key_comparator,
const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound) {
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
Arena* arena = iter->GetArena();
auto mem = arena->AllocateAligned(sizeof(DBIter));
DBIter* db_iter = new (mem)
DBIter(env, options, user_key_comparator, nullptr, sequence, true);
DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator,
nullptr, sequence, true, max_sequential_skip_in_iterations,
iterate_upper_bound);
iter->SetDBIter(db_iter);
return iter;
}
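The pattern above is standard placement new: the DBIter lives inside the arena, so ~ArenaWrappedDBIter() calls the destructor explicitly instead of using delete. A self-contained illustration:

#include <new>

struct Obj { int x = 0; };

void PlacementNewDemo() {
  alignas(Obj) char buf[sizeof(Obj)];
  Obj* o = new (buf) Obj();  // construct in pre-allocated storage
  o->~Obj();                 // destroy without deallocating the storage
}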

@ -24,10 +24,12 @@ class DBIter;
// into appropriate user keys.
extern Iterator* NewDBIterator(
Env* env,
const Options& options,
const ImmutableCFOptions& options,
const Comparator *user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence);
const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr);
// A wrapper iterator which wraps the DB iterator and the arena, with which
// the DB iterator is supposed to be allocated. This class is used as an entry point of
@ -67,7 +69,9 @@ class ArenaWrappedDBIter : public Iterator {
// Generate the arena wrapped iterator class.
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator,
const SequenceNumber& sequence);
Env* env, const ImmutableCFOptions& options,
const Comparator* user_key_comparator,
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr);
} // namespace rocksdb

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -127,8 +127,8 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
}
}
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
size_t usize = user_key.size();
LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) {
size_t usize = _user_key.size();
size_t needed = usize + 13; // A conservative estimate
char* dst;
if (needed <= sizeof(space_)) {
@ -137,9 +137,10 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
dst = new char[needed];
}
start_ = dst;
dst = EncodeVarint32(dst, usize + 8);
// NOTE: We don't support user keys of more than 2GB :)
dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + 8));
kstart_ = dst;
memcpy(dst, user_key.data(), usize);
memcpy(dst, _user_key.data(), usize);
dst += usize;
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
dst += 8;
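A worked example of the layout built above, assuming the key "abc" and sequence number 7 so the varint length fits in one byte (assumptions from dbformat.h: PackSequenceAndType packs (sequence << 8) | type, kValueTypeForSeek = kTypeValue = 0x01, and fixed64 is little-endian):

// LookupKey("abc", 7) occupies 12 bytes:
//   start_  -> 0x0B                          varint32 of usize + 8 = 11
//   kstart_ -> 'a' 'b' 'c'                   the user key (3 bytes)
//              01 07 00 00 00 00 00 00       fixed64 of (7 << 8) | 0x01
// memtable_key() = [start_, end_)      -> all 12 bytes
// internal_key() = [kstart_, end_)     -> 11 bytes (key + packed tail)
// user_key()     = [kstart_, end_ - 8) -> "abc"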

@ -132,8 +132,8 @@ class InternalKey {
std::string rep_;
public:
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
}
bool Valid() const {
@ -201,18 +201,24 @@ class LookupKey {
public:
// Initialize *this for looking up user_key at a snapshot with
// the specified sequence number.
LookupKey(const Slice& user_key, SequenceNumber sequence);
LookupKey(const Slice& _user_key, SequenceNumber sequence);
~LookupKey();
// Return a key suitable for lookup in a MemTable.
Slice memtable_key() const { return Slice(start_, end_ - start_); }
Slice memtable_key() const {
return Slice(start_, static_cast<size_t>(end_ - start_));
}
// Return an internal key (suitable for passing to an internal iterator)
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
Slice internal_key() const {
return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
}
// Return the user key
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
Slice user_key() const {
return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
}
private:
// We construct a char array of the form:
@ -244,7 +250,7 @@ class IterKey {
Slice GetKey() const { return Slice(key_, key_size_); }
const size_t Size() { return key_size_; }
size_t Size() { return key_size_; }
void Clear() { key_size_ = 0; }
@ -319,8 +325,8 @@ class IterKey {
void EncodeLengthPrefixedKey(const Slice& key) {
auto size = key.size();
EnlargeBufferIfNeeded(size + VarintLength(size));
char* ptr = EncodeVarint32(key_, size);
EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
char* ptr = EncodeVarint32(key_, static_cast<uint32_t>(size));
memcpy(ptr, key.data(), size);
}

@ -34,6 +34,8 @@ class DeleteFileTest {
DeleteFileTest() {
db_ = nullptr;
env_ = Env::Default();
options_.enable_thread_tracking = true;
options_.max_background_flushes = 0;
options_.write_buffer_size = 1024*1024*1000;
options_.target_file_size_base = 1024*1024*1000;
options_.max_bytes_for_level_base = 1024*1024*1000;
@ -77,7 +79,7 @@ class DeleteFileTest {
options.sync = false;
ReadOptions roptions;
for (int i = startkey; i < (numkeys + startkey) ; i++) {
std::string temp = std::to_string(i);
std::string temp = ToString(i);
Slice key(temp);
Slice value(temp);
ASSERT_OK(db_->Put(options, key, value));
@ -147,7 +149,6 @@ class DeleteFileTest {
TEST(DeleteFileTest, AddKeysAndQueryLevels) {
CreateTwoLevels();
std::vector<LiveFileMetaData> metadata;
std::vector<int> keysinlevel;
db_->GetLiveFilesMetaData(&metadata);
std::string level1file = "";
@ -287,6 +288,75 @@ TEST(DeleteFileTest, DeleteLogFiles) {
CloseDB();
}
TEST(DeleteFileTest, DeleteNonDefaultColumnFamily) {
CloseDB();
DBOptions db_options;
db_options.create_if_missing = true;
db_options.create_missing_column_families = true;
std::vector<ColumnFamilyDescriptor> column_families;
column_families.emplace_back();
column_families.emplace_back("new_cf", ColumnFamilyOptions());
std::vector<rocksdb::ColumnFamilyHandle*> handles;
rocksdb::DB* db;
ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db));
Random rnd(5);
for (int i = 0; i < 1000; ++i) {
ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10),
test::RandomKey(&rnd, 10)));
}
ASSERT_OK(db->Flush(FlushOptions(), handles[1]));
for (int i = 0; i < 1000; ++i) {
ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10),
test::RandomKey(&rnd, 10)));
}
ASSERT_OK(db->Flush(FlushOptions(), handles[1]));
std::vector<LiveFileMetaData> metadata;
db->GetLiveFilesMetaData(&metadata);
ASSERT_EQ(2U, metadata.size());
ASSERT_EQ("new_cf", metadata[0].column_family_name);
ASSERT_EQ("new_cf", metadata[1].column_family_name);
auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
? metadata[0].name
: metadata[1].name;
auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
? metadata[0].name
: metadata[1].name;
ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument());
ASSERT_OK(db->DeleteFile(old_file));
{
std::unique_ptr<Iterator> itr(db->NewIterator(ReadOptions(), handles[1]));
int count = 0;
for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
ASSERT_OK(itr->status());
++count;
}
ASSERT_EQ(count, 1000);
}
delete handles[0];
delete handles[1];
delete db;
ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db));
{
std::unique_ptr<Iterator> itr(db->NewIterator(ReadOptions(), handles[1]));
int count = 0;
for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
ASSERT_OK(itr->status());
++count;
}
ASSERT_EQ(count, 1000);
}
delete handles[0];
delete handles[1];
delete db;
}
} //namespace rocksdb
int main(int argc, char** argv) {

@ -0,0 +1,742 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// This test uses a custom Env to keep track of the state of a filesystem as of
// the last "sync". It then checks for data loss errors by purposely dropping
// file data (or entire files) not protected by a "sync".
#include <map>
#include <set>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/write_batch.h"
#include "util/logging.h"
#include "util/mock_env.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
static const int kValueSize = 1000;
static const int kMaxNumValues = 2000;
static const size_t kNumIterations = 3;
class TestWritableFile;
class FaultInjectionTestEnv;
namespace {
// Assumes a filename, and not a directory name like "/foo/bar/"
static std::string GetDirName(const std::string filename) {
size_t found = filename.find_last_of("/\\");
if (found == std::string::npos) {
return "";
} else {
return filename.substr(0, found);
}
}
// Trim the trailing "/" at the end of `str`
static std::string TrimDirname(const std::string& str) {
size_t found = str.find_last_not_of("/");
if (found == std::string::npos) {
return str;
}
return str.substr(0, found + 1);
}
// Return pair <parent directory name, file name> of a full path.
static std::pair<std::string, std::string> GetDirAndName(
const std::string& name) {
std::string dirname = GetDirName(name);
std::string fname = name.substr(dirname.size() + 1);
return std::make_pair(dirname, fname);
}
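Example behavior of these helpers (illustrative paths):

// GetDirName("/db/000001.sst")    == "/db"
// GetDirName("CURRENT")           == ""        (no separator found)
// TrimDirname("/db/")             == "/db"
// GetDirAndName("/db/000001.sst") == {"/db", "000001.sst"}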
// A basic file truncation function suitable for this test.
Status Truncate(Env* env, const std::string& filename, uint64_t length) {
unique_ptr<SequentialFile> orig_file;
const EnvOptions options;
Status s = env->NewSequentialFile(filename, &orig_file, options);
if (!s.ok()) {
fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
s.ToString().c_str());
return s;
}
char* scratch = new char[length];
rocksdb::Slice result;
s = orig_file->Read(length, &result, scratch);
if (s.ok()) {
std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
unique_ptr<WritableFile> tmp_file;
s = env->NewWritableFile(tmp_name, &tmp_file, options);
if (s.ok()) {
s = tmp_file->Append(result);
if (s.ok()) {
s = env->RenameFile(tmp_name, filename);
} else {
fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(),
filename.c_str(), s.ToString().c_str());
env->DeleteFile(tmp_name);
}
}
}
if (!s.ok()) {
fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
s.ToString().c_str());
}
delete[] scratch;
return s;
}
struct FileState {
std::string filename_;
ssize_t pos_;
ssize_t pos_at_last_sync_;
ssize_t pos_at_last_flush_;
explicit FileState(const std::string& filename)
: filename_(filename),
pos_(-1),
pos_at_last_sync_(-1),
pos_at_last_flush_(-1) { }
FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
Status DropUnsyncedData(Env* env) const;
Status DropRandomUnsyncedData(Env* env, Random* rand) const;
};
} // anonymous namespace
// A wrapper around WritableFile which informs another Env whenever this file
// is written to or sync'ed.
class TestWritableFile : public WritableFile {
public:
explicit TestWritableFile(const std::string& fname,
unique_ptr<WritableFile>&& f,
FaultInjectionTestEnv* env);
virtual ~TestWritableFile();
virtual Status Append(const Slice& data);
virtual Status Close();
virtual Status Flush();
virtual Status Sync();
private:
FileState state_;
unique_ptr<WritableFile> target_;
bool writable_file_opened_;
FaultInjectionTestEnv* env_;
};
class TestDirectory : public Directory {
public:
explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname,
Directory* dir)
: env_(env), dirname_(dirname), dir_(dir) {}
~TestDirectory() {}
virtual Status Fsync() override;
private:
FaultInjectionTestEnv* env_;
std::string dirname_;
unique_ptr<Directory> dir_;
};
class FaultInjectionTestEnv : public EnvWrapper {
public:
explicit FaultInjectionTestEnv(Env* base)
: EnvWrapper(base),
filesystem_active_(true) {}
virtual ~FaultInjectionTestEnv() { }
Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) override {
unique_ptr<Directory> r;
Status s = target()->NewDirectory(name, &r);
ASSERT_OK(s);
if (!s.ok()) {
return s;
}
result->reset(new TestDirectory(this, TrimDirname(name), r.release()));
return Status::OK();
}
Status NewWritableFile(const std::string& fname,
unique_ptr<WritableFile>* result,
const EnvOptions& soptions) {
Status s = target()->NewWritableFile(fname, result, soptions);
if (s.ok()) {
result->reset(new TestWritableFile(fname, std::move(*result), this));
// WritableFile doesn't append to files, so if the same file is opened
// again then it will be truncated - so forget our saved state.
UntrackFile(fname);
MutexLock l(&mutex_);
open_files_.insert(fname);
auto dir_and_name = GetDirAndName(fname);
auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
list.insert(dir_and_name.second);
}
return s;
}
virtual Status DeleteFile(const std::string& f) {
Status s = EnvWrapper::DeleteFile(f);
if (!s.ok()) {
fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(),
s.ToString().c_str());
}
ASSERT_OK(s);
if (s.ok()) {
UntrackFile(f);
}
return s;
}
virtual Status RenameFile(const std::string& s, const std::string& t) {
Status ret = EnvWrapper::RenameFile(s, t);
if (ret.ok()) {
MutexLock l(&mutex_);
if (db_file_state_.find(s) != db_file_state_.end()) {
db_file_state_[t] = db_file_state_[s];
db_file_state_.erase(s);
}
auto sdn = GetDirAndName(s);
auto tdn = GetDirAndName(t);
if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) {
auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
assert(tlist.find(tdn.second) == tlist.end());
tlist.insert(tdn.second);
}
}
return ret;
}
void WritableFileClosed(const FileState& state) {
MutexLock l(&mutex_);
if (open_files_.find(state.filename_) != open_files_.end()) {
db_file_state_[state.filename_] = state;
open_files_.erase(state.filename_);
}
}
// For every file that is not fully synced, make a call to `func` with
// the FileState of the file as the parameter.
Status DropFileData(std::function<Status(Env*, FileState)> func) {
Status s;
MutexLock l(&mutex_);
for (std::map<std::string, FileState>::const_iterator it =
db_file_state_.begin();
s.ok() && it != db_file_state_.end(); ++it) {
const FileState& state = it->second;
if (!state.IsFullySynced()) {
s = func(target(), state);
}
}
return s;
}
Status DropUnsyncedFileData() {
return DropFileData([&](Env* env, const FileState& state) {
return state.DropUnsyncedData(env);
});
}
Status DropRandomUnsyncedFileData(Random* rnd) {
return DropFileData([&](Env* env, const FileState& state) {
return state.DropRandomUnsyncedData(env, rnd);
});
}
Status DeleteFilesCreatedAfterLastDirSync() {
// Because DeleteFile accesses this container, make a copy to avoid deadlock
std::map<std::string, std::set<std::string>> map_copy;
{
MutexLock l(&mutex_);
map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
dir_to_new_files_since_last_sync_.end());
}
for (auto& pair : map_copy) {
for (std::string name : pair.second) {
Status s = DeleteFile(pair.first + "/" + name);
if (!s.ok()) {
return s;
}
}
}
return Status::OK();
}
void ResetState() {
MutexLock l(&mutex_);
db_file_state_.clear();
dir_to_new_files_since_last_sync_.clear();
SetFilesystemActiveNoLock(true);
}
void UntrackFile(const std::string& f) {
MutexLock l(&mutex_);
auto dir_and_name = GetDirAndName(f);
dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
dir_and_name.second);
db_file_state_.erase(f);
open_files_.erase(f);
}
void SyncDir(const std::string& dirname) {
MutexLock l(&mutex_);
dir_to_new_files_since_last_sync_.erase(dirname);
}
// Setting the filesystem to inactive is the test equivalent of simulating a
// system reset. Setting it to inactive will freeze our saved filesystem
// state so that it will stop being recorded. It can then be reset back to
// the state at the time of the reset.
bool IsFilesystemActive() {
MutexLock l(&mutex_);
return filesystem_active_;
}
void SetFilesystemActiveNoLock(bool active) { filesystem_active_ = active; }
void SetFilesystemActive(bool active) {
MutexLock l(&mutex_);
SetFilesystemActiveNoLock(active);
}
void AssertNoOpenFile() { ASSERT_TRUE(open_files_.empty()); }
private:
port::Mutex mutex_;
std::map<std::string, FileState> db_file_state_;
std::set<std::string> open_files_;
std::unordered_map<std::string, std::set<std::string>>
dir_to_new_files_since_last_sync_;
bool filesystem_active_; // Record flushes, syncs, writes
};
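In the tests below, a simulated crash is three steps: SetFilesystemActive(false) freezes the recorded state, CloseDB() plus one of the Drop*/DeleteFilesCreatedAfterLastDirSync() calls above discards whatever was not synced, and OpenDB() lets recovery run before Verify() checks the surviving data (see PartialCompactTestReopenWithFault).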
Status FileState::DropUnsyncedData(Env* env) const {
ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
return Truncate(env, filename_, sync_pos);
}
Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const {
ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
assert(pos_ >= sync_pos);
int range = static_cast<int>(pos_ - sync_pos);
uint64_t truncated_size =
static_cast<uint64_t>(sync_pos) + rand->Uniform(range);
return Truncate(env, filename_, truncated_size);
}
Status TestDirectory::Fsync() {
env_->SyncDir(dirname_);
return dir_->Fsync();
}
TestWritableFile::TestWritableFile(const std::string& fname,
unique_ptr<WritableFile>&& f,
FaultInjectionTestEnv* env)
: state_(fname),
target_(std::move(f)),
writable_file_opened_(true),
env_(env) {
assert(target_ != nullptr);
state_.pos_ = 0;
}
TestWritableFile::~TestWritableFile() {
if (writable_file_opened_) {
Close();
}
}
Status TestWritableFile::Append(const Slice& data) {
Status s = target_->Append(data);
if (s.ok() && env_->IsFilesystemActive()) {
state_.pos_ += data.size();
}
return s;
}
Status TestWritableFile::Close() {
writable_file_opened_ = false;
Status s = target_->Close();
if (s.ok()) {
env_->WritableFileClosed(state_);
}
return s;
}
Status TestWritableFile::Flush() {
Status s = target_->Flush();
if (s.ok() && env_->IsFilesystemActive()) {
state_.pos_at_last_flush_ = state_.pos_;
}
return s;
}
Status TestWritableFile::Sync() {
if (!env_->IsFilesystemActive()) {
return Status::OK();
}
// No need to actually sync.
state_.pos_at_last_sync_ = state_.pos_;
return Status::OK();
}
class FaultInjectionTest {
protected:
enum OptionConfig {
kDefault,
kDifferentDataDir,
kWalDir,
kSyncWal,
kWalDirSyncWal,
kMultiLevels,
kEnd,
};
int option_config_;
// When we need to make sure data is persistent, sync the WAL
bool sync_use_wal_;
// When we need to make sure data is persistent, call DB::CompactRange()
bool sync_use_compact_;
protected:
public:
enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
enum ResetMethod {
kResetDropUnsyncedData,
kResetDropRandomUnsyncedData,
kResetDeleteUnsyncedFiles,
kResetDropAndDeleteUnsynced
};
std::unique_ptr<Env> base_env_;
FaultInjectionTestEnv* env_;
std::string dbname_;
shared_ptr<Cache> tiny_cache_;
Options options_;
DB* db_;
FaultInjectionTest()
: option_config_(kDefault),
sync_use_wal_(false),
sync_use_compact_(true),
base_env_(nullptr),
env_(NULL),
db_(NULL) {
NewDB();
}
~FaultInjectionTest() { ASSERT_OK(TearDown()); }
bool ChangeOptions() {
option_config_++;
if (option_config_ >= kEnd) {
return false;
} else {
if (option_config_ == kMultiLevels) {
base_env_.reset(new MockEnv(Env::Default()));
}
return true;
}
}
// Return the current option configuration.
Options CurrentOptions() {
sync_use_wal_ = false;
sync_use_compact_ = true;
Options options;
switch (option_config_) {
case kWalDir:
options.wal_dir = test::TmpDir(env_) + "/fault_test_wal";
break;
case kDifferentDataDir:
options.db_paths.emplace_back(test::TmpDir(env_) + "/fault_test_data",
1000000U);
break;
case kSyncWal:
sync_use_wal_ = true;
sync_use_compact_ = false;
break;
case kWalDirSyncWal:
options.wal_dir = test::TmpDir(env_) + "/fault_test_wal";
sync_use_wal_ = true;
sync_use_compact_ = false;
break;
case kMultiLevels:
options.write_buffer_size = 64 * 1024;
options.target_file_size_base = 64 * 1024;
options.level0_file_num_compaction_trigger = 2;
options.level0_slowdown_writes_trigger = 2;
options.level0_stop_writes_trigger = 4;
options.max_bytes_for_level_base = 128 * 1024;
options.max_write_buffer_number = 2;
options.max_background_compactions = 8;
options.max_background_flushes = 8;
sync_use_wal_ = true;
sync_use_compact_ = false;
break;
default:
break;
}
return options;
}
Status NewDB() {
assert(db_ == NULL);
assert(tiny_cache_ == nullptr);
assert(env_ == NULL);
env_ =
new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default());
options_ = CurrentOptions();
options_.env = env_;
options_.paranoid_checks = true;
BlockBasedTableOptions table_options;
tiny_cache_ = NewLRUCache(100);
table_options.block_cache = tiny_cache_;
options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
dbname_ = test::TmpDir() + "/fault_test";
ASSERT_OK(DestroyDB(dbname_, options_));
options_.create_if_missing = true;
Status s = OpenDB();
options_.create_if_missing = false;
return s;
}
Status SetUp() {
Status s = TearDown();
if (s.ok()) {
s = NewDB();
}
return s;
}
Status TearDown() {
CloseDB();
Status s = DestroyDB(dbname_, options_);
delete env_;
env_ = NULL;
tiny_cache_.reset();
return s;
}
void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = start_idx; i < start_idx + num_vals; i++) {
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
ASSERT_OK(db_->Write(write_options, &batch));
}
}
Status ReadValue(int i, std::string* val) const {
std::string key_space, value_space;
Slice key = Key(i, &key_space);
Value(i, &value_space);
ReadOptions options;
return db_->Get(options, key, val);
}
Status Verify(int start_idx, int num_vals,
ExpectedVerifResult expected) const {
std::string val;
std::string value_space;
Status s;
for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
Value(i, &value_space);
s = ReadValue(i, &val);
if (s.ok()) {
ASSERT_EQ(value_space, val);
}
if (expected == kValExpectFound) {
if (!s.ok()) {
fprintf(stderr, "Error when read %dth record (expect found): %s\n", i,
s.ToString().c_str());
return s;
}
} else if (!s.ok() && !s.IsNotFound()) {
fprintf(stderr, "Error when read %dth record: %s\n", i,
s.ToString().c_str());
return s;
}
}
return Status::OK();
}
// Return the ith key
Slice Key(int i, std::string* storage) const {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}
// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) const {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}
Status OpenDB() {
delete db_;
db_ = NULL;
env_->ResetState();
return DB::Open(options_, dbname_, &db_);
}
void CloseDB() {
delete db_;
db_ = NULL;
}
void DeleteAllData() {
Iterator* iter = db_->NewIterator(ReadOptions());
WriteOptions options;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
}
delete iter;
FlushOptions flush_options;
flush_options.wait = true;
db_->Flush(flush_options);
}
// rnd cannot be null for kResetDropRandomUnsyncedData
void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
env_->AssertNoOpenFile();
switch (reset_method) {
case kResetDropUnsyncedData:
ASSERT_OK(env_->DropUnsyncedFileData());
break;
case kResetDropRandomUnsyncedData:
ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
break;
case kResetDeleteUnsyncedFiles:
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
break;
case kResetDropAndDeleteUnsynced:
ASSERT_OK(env_->DropUnsyncedFileData());
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
break;
default:
assert(false);
}
}
void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
DeleteAllData();
WriteOptions write_options;
write_options.sync = sync_use_wal_;
Build(write_options, 0, num_pre_sync);
if (sync_use_compact_) {
db_->CompactRange(nullptr, nullptr);
}
write_options.sync = false;
Build(write_options, num_pre_sync, num_post_sync);
}
void PartialCompactTestReopenWithFault(ResetMethod reset_method,
int num_pre_sync, int num_post_sync,
Random* rnd = nullptr) {
env_->SetFilesystemActive(false);
CloseDB();
ResetDBState(reset_method, rnd);
ASSERT_OK(OpenDB());
ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
ASSERT_OK(Verify(num_pre_sync, num_post_sync,
FaultInjectionTest::kValExpectNoError));
}
void NoWriteTestPreFault() {
}
void NoWriteTestReopenWithFault(ResetMethod reset_method) {
CloseDB();
ResetDBState(reset_method);
ASSERT_OK(OpenDB());
}
};
TEST(FaultInjectionTest, FaultTest) {
do {
Random rnd(301);
ASSERT_OK(SetUp());
for (size_t idx = 0; idx < kNumIterations; idx++) {
int num_pre_sync = rnd.Uniform(kMaxNumValues);
int num_post_sync = rnd.Uniform(kMaxNumValues);
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
num_post_sync);
NoWriteTestPreFault();
NoWriteTestReopenWithFault(kResetDropUnsyncedData);
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
num_pre_sync, num_post_sync, &rnd);
NoWriteTestPreFault();
NoWriteTestReopenWithFault(kResetDropUnsyncedData);
// Setting a separate data path won't pass the test as we don't sync
// it after creating new files.
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
num_pre_sync, num_post_sync);
NoWriteTestPreFault();
NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
// No new files created so we expect all values since no files will be
// dropped.
PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
num_post_sync);
NoWriteTestPreFault();
NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
}
} while (ChangeOptions());
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

@ -17,17 +17,16 @@ namespace rocksdb {
FileIndexer::FileIndexer(const Comparator* ucmp)
: num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}
uint32_t FileIndexer::NumLevelIndex() {
return next_level_index_.size();
}
size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }
uint32_t FileIndexer::LevelIndexSize(uint32_t level) {
size_t FileIndexer::LevelIndexSize(size_t level) const {
return next_level_index_[level].num_index;
}
void FileIndexer::GetNextLevelIndex(
const uint32_t level, const uint32_t file_index, const int cmp_smallest,
const int cmp_largest, int32_t* left_bound, int32_t* right_bound) {
void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
const int cmp_smallest,
const int cmp_largest, int32_t* left_bound,
int32_t* right_bound) const {
assert(level > 0);
// Last level, no hint
@ -69,7 +68,7 @@ void FileIndexer::GetNextLevelIndex(
assert(*right_bound <= level_rb_[level + 1]);
}
void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels,
void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
std::vector<FileMetaData*>* const files) {
if (files == nullptr) {
return;
@ -90,17 +89,17 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels,
}
// L1 - Ln-1
for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
for (size_t level = 1; level < num_levels_ - 1; ++level) {
const auto& upper_files = files[level];
const int32_t upper_size = upper_files.size();
const int32_t upper_size = static_cast<int32_t>(upper_files.size());
const auto& lower_files = files[level + 1];
level_rb_[level] = upper_files.size() - 1;
level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
if (upper_size == 0) {
continue;
}
IndexLevel& index_level = next_level_index_[level];
index_level.num_index = upper_size;
char* mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
index_level.index_units = new (mem) IndexUnit[upper_size];
CalculateLB(
@ -129,7 +128,8 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels,
[](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
}
level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
level_rb_[num_levels_ - 1] =
static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
}
void FileIndexer::CalculateLB(
@ -137,8 +137,8 @@ void FileIndexer::CalculateLB(
const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index) {
const int32_t upper_size = upper_files.size();
const int32_t lower_size = lower_files.size();
const int32_t upper_size = static_cast<int32_t>(upper_files.size());
const int32_t lower_size = static_cast<int32_t>(lower_files.size());
int32_t upper_idx = 0;
int32_t lower_idx = 0;
@ -175,8 +175,8 @@ void FileIndexer::CalculateRB(
const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index) {
const int32_t upper_size = upper_files.size();
const int32_t lower_size = lower_files.size();
const int32_t upper_size = static_cast<int32_t>(upper_files.size());
const int32_t lower_size = static_cast<int32_t>(lower_files.size());
int32_t upper_idx = upper_size - 1;
int32_t lower_idx = lower_size - 1;

@ -42,19 +42,19 @@ class FileIndexer {
public:
explicit FileIndexer(const Comparator* ucmp);
uint32_t NumLevelIndex();
size_t NumLevelIndex() const;
uint32_t LevelIndexSize(uint32_t level);
size_t LevelIndexSize(size_t level) const;
// Return a file index range in the next level to search for a key based on
// smallest and largest key comparison for the current file specified by
// level and file_index. When *left_bound < *right_bound, both indexes
// should be valid and fit in the vector size.
void GetNextLevelIndex(
const uint32_t level, const uint32_t file_index, const int cmp_smallest,
const int cmp_largest, int32_t* left_bound, int32_t* right_bound);
void GetNextLevelIndex(const size_t level, const size_t file_index,
const int cmp_smallest, const int cmp_largest,
int32_t* left_bound, int32_t* right_bound) const;
void UpdateIndex(Arena* arena, const uint32_t num_levels,
void UpdateIndex(Arena* arena, const size_t num_levels,
std::vector<FileMetaData*>* const files);
enum {
@ -62,7 +62,7 @@ class FileIndexer {
};
private:
uint32_t num_levels_;
size_t num_levels_;
const Comparator* ucmp_;
struct IndexUnit {

@ -22,8 +22,15 @@ class IntComparator : public Comparator {
int Compare(const Slice& a, const Slice& b) const {
assert(a.size() == 8);
assert(b.size() == 8);
return *reinterpret_cast<const int64_t*>(a.data()) -
*reinterpret_cast<const int64_t*>(b.data());
int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
*reinterpret_cast<const int64_t*>(b.data());
if (diff < 0) {
return -1;
} else if (diff == 0) {
return 0;
} else {
return 1;
}
}
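The rewrite above avoids two bugs in the subtraction version: the int64_t difference can overflow (undefined behavior), and truncating it to the int return type can flip or zero the sign. Hypothetical values:

// a = 1, b = INT64_MIN       -> a - b overflows int64_t (UB)
// a - b = 4294967296 (2^32)  -> truncated to a 32-bit int it becomes 0,
//                               reporting "equal" for unequal keys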
const char* Name() const {
@ -94,7 +101,6 @@ TEST(FileIndexerTest, Empty) {
// Case 1: no overlap, files are on the left of next level files
TEST(FileIndexerTest, no_overlap_left) {
Arena arena;
uint32_t kNumLevels = 4;
indexer = new FileIndexer(&ucmp);
// level 1
AddFile(1, 100, 200);
@ -135,7 +141,6 @@ TEST(FileIndexerTest, no_overlap_left) {
// Case 2: no overlap, files are on the right of next level files
TEST(FileIndexerTest, no_overlap_right) {
Arena arena;
uint32_t kNumLevels = 4;
indexer = new FileIndexer(&ucmp);
// level 1
AddFile(1, 2100, 2200);
@ -178,7 +183,6 @@ TEST(FileIndexerTest, no_overlap_right) {
// Case 3: empty L2
TEST(FileIndexerTest, empty_L2) {
Arena arena;
uint32_t kNumLevels = 4;
indexer = new FileIndexer(&ucmp);
for (uint32_t i = 1; i < kNumLevels; ++i) {
ASSERT_EQ(0U, indexer->LevelIndexSize(i));

@ -6,7 +6,10 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include "db/filename.h"
#include <inttypes.h>
@ -16,6 +19,7 @@
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "util/logging.h"
#include "util/stop_watch.h"
namespace rocksdb {
@ -76,6 +80,17 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) {
return MakeFileName(path, number, "sst");
}
uint64_t TableFileNameToNumber(const std::string& name) {
uint64_t number = 0;
uint64_t base = 1;
int pos = static_cast<int>(name.find_last_of('.'));
while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
number += (name[pos] - '0') * base;
base *= 10;
}
return number;
}
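Example behavior (illustrative): digits are read right to left starting just before the extension, so any path prefix is ignored:

// TableFileNameToNumber("/db/000123.sst") == 123
// TableFileNameToNumber("45.sst")         == 45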
std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
uint32_t path_id) {
assert(number > 0);
@ -315,4 +330,16 @@ Status SetIdentityFile(Env* env, const std::string& dbname) {
return s;
}
Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) {
if (db_options->disableDataSync) {
return Status::OK();
} else if (db_options->use_fsync) {
StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
return file->Fsync();
} else {
StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
return file->Sync();
}
}
} // namespace rocksdb
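A minimal standalone check (editor's sketch, not part of the diff) of the backwards digit scan in the new TableFileNameToNumber above; the local reimplementation mirrors the function so the example compiles without RocksDB headers:
#include <cassert>
#include <cstdint>
#include <string>

// Mirrors TableFileNameToNumber: scan digits right-to-left from the last '.'.
uint64_t NameToNumber(const std::string& name) {
  uint64_t number = 0;
  uint64_t base = 1;
  int pos = static_cast<int>(name.find_last_of('.'));
  while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
    number += (name[pos] - '0') * base;
    base *= 10;
  }
  return number;
}

int main() {
  assert(NameToNumber("000123.sst") == 123);
  assert(NameToNumber("/path/to/db/000042.sst") == 42);
  return 0;
}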

@ -14,15 +14,18 @@
#include <unordered_map>
#include <string>
#include <vector>
#include "port/port.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/transaction_log.h"
#include "port/port.h"
namespace rocksdb {
class Env;
class Directory;
class WritableFile;
enum FileType {
kLogFile,
@ -36,9 +39,6 @@ enum FileType {
kIdentityFile
};
// map from file number to path ID.
typedef std::unordered_map<uint64_t, uint32_t> FileNumToPathIdMap;
// Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
@ -55,6 +55,10 @@ extern std::string ArchivedLogFileName(const std::string& dbname,
extern std::string MakeTableFileName(const std::string& name, uint64_t number);
// the reverse function of MakeTableFileName
// TODO(yhchiang): could merge this function with ParseFileName()
extern uint64_t TableFileNameToNumber(const std::string& name);
// Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
@ -134,4 +138,8 @@ extern Status SetCurrentFile(Env* env, const std::string& dbname,
// Make the IDENTITY file for the db
extern Status SetIdentityFile(Env* env, const std::string& dbname);
// Sync manifest file `file`.
extern Status SyncManifest(Env* env, const DBOptions* db_options,
WritableFile* file);
} // namespace rocksdb

@ -0,0 +1,221 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/flush_job.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <algorithm>
#include <vector>
#include "db/builder.h"
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/memtable_list.h"
#include "db/merge_context.h"
#include "db/version_set.h"
#include "port/port.h"
#include "port/likely.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "table/block.h"
#include "table/block_based_table_factory.h"
#include "table/merger.h"
#include "table/table_builder.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/file_util.h"
#include "util/logging.h"
#include "util/log_buffer.h"
#include "util/mutexlock.h"
#include "util/perf_context_imp.h"
#include "util/iostats_context_imp.h"
#include "util/stop_watch.h"
#include "util/sync_point.h"
namespace rocksdb {
FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
const DBOptions& db_options,
const MutableCFOptions& mutable_cf_options,
const EnvOptions& env_options, VersionSet* versions,
InstrumentedMutex* db_mutex,
std::atomic<bool>* shutting_down,
SequenceNumber newest_snapshot, JobContext* job_context,
LogBuffer* log_buffer, Directory* db_directory,
Directory* output_file_directory,
CompressionType output_compression, Statistics* stats)
: dbname_(dbname),
cfd_(cfd),
db_options_(db_options),
mutable_cf_options_(mutable_cf_options),
env_options_(env_options),
versions_(versions),
db_mutex_(db_mutex),
shutting_down_(shutting_down),
newest_snapshot_(newest_snapshot),
job_context_(job_context),
log_buffer_(log_buffer),
db_directory_(db_directory),
output_file_directory_(output_file_directory),
output_compression_(output_compression),
stats_(stats) {}
Status FlushJob::Run(uint64_t* file_number) {
// Save the contents of the earliest memtable as a new Table
uint64_t fn;
autovector<MemTable*> mems;
cfd_->imm()->PickMemtablesToFlush(&mems);
if (mems.empty()) {
LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush",
cfd_->GetName().c_str());
return Status::OK();
}
// The memtables in `mems` are (implicitly) sorted in ascending order by their
// creation time. We will use the first memtable's `edit` to keep the meta
// info for this flush.
MemTable* m = mems[0];
VersionEdit* edit = m->GetEdits();
edit->SetPrevLogNumber(0);
// SetLogNumber(log_num) indicates that logs with numbers smaller than log_num
// will no longer be picked up for recovery.
edit->SetLogNumber(mems.back()->GetNextLogNumber());
edit->SetColumnFamily(cfd_->GetID());
// This will release and re-acquire the mutex.
Status s = WriteLevel0Table(mems, edit, &fn);
if (s.ok() &&
(shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) {
s = Status::ShutdownInProgress(
"Database shutdown or Column family drop during flush");
}
if (!s.ok()) {
cfd_->imm()->RollbackMemtableFlush(mems, fn);
} else {
// Replace immutable memtable with the generated Table
s = cfd_->imm()->InstallMemtableFlushResults(
cfd_, mutable_cf_options_, mems, versions_, db_mutex_, fn,
&job_context_->memtables_to_free, db_directory_, log_buffer_);
}
if (s.ok() && file_number != nullptr) {
*file_number = fn;
}
return s;
}
Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
VersionEdit* edit, uint64_t* filenumber) {
db_mutex_->AssertHeld();
const uint64_t start_micros = db_options_.env->NowMicros();
FileMetaData meta;
// path 0 for level 0 file.
meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
*filenumber = meta.fd.GetNumber();
const SequenceNumber earliest_seqno_in_memtable =
mems[0]->GetFirstSequenceNumber();
Version* base = cfd_->current();
base->Ref(); // it is likely that we do not need this reference
Status s;
{
db_mutex_->Unlock();
if (log_buffer_) {
log_buffer_->FlushBufferToLog();
}
std::vector<Iterator*> memtables;
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
for (MemTable* m : mems) {
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"[%s] Flushing memtable with next log file: %" PRIu64 "\n",
cfd_->GetName().c_str(), m->GetNextLogNumber());
memtables.push_back(m->NewIterator(ro, &arena));
}
{
ScopedArenaIterator iter(
NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
static_cast<int>(memtables.size()), &arena));
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"[%s] Level-0 flush table #%" PRIu64 ": started",
cfd_->GetName().c_str(), meta.fd.GetNumber());
s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
cfd_->table_cache(), iter.get(), &meta,
cfd_->internal_comparator(), newest_snapshot_,
earliest_seqno_in_memtable, output_compression_,
cfd_->ioptions()->compression_opts, Env::IO_HIGH);
LogFlush(db_options_.info_log);
}
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s",
cfd_->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
s.ToString().c_str());
if (!db_options_.disableDataSync && output_file_directory_ != nullptr) {
output_file_directory_->Fsync();
}
db_mutex_->Lock();
}
base->Unref();
// re-acquire the most current version
base = cfd_->current();
// Note that if file_size is zero, the file has been deleted and
// should not be added to the manifest.
int level = 0;
if (s.ok() && meta.fd.GetFileSize() > 0) {
const Slice min_user_key = meta.smallest.user_key();
const Slice max_user_key = meta.largest.user_key();
// if we have more than 1 background thread, then we cannot
// insert files directly into higher levels because some other
// threads could be concurrently producing compacted files for
// that key range.
if (base != nullptr && db_options_.max_background_compactions <= 1 &&
db_options_.max_background_flushes == 0 &&
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
level = base->storage_info()->PickLevelForMemTableOutput(
mutable_cf_options_, min_user_key, max_user_key);
// If level does not match path id, reset level back to 0
uint32_t fdpath = LevelCompactionPicker::GetPathId(
*cfd_->ioptions(), mutable_cf_options_, level);
if (fdpath != 0) {
level = 0;
}
}
edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
meta.fd.GetFileSize(), meta.smallest, meta.largest,
meta.smallest_seqno, meta.largest_seqno);
}
InternalStats::CompactionStats stats(1);
stats.micros = db_options_.env->NowMicros() - start_micros;
stats.bytes_written = meta.fd.GetFileSize();
cfd_->internal_stats()->AddCompactionStats(level, stats);
cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
meta.fd.GetFileSize());
RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
return s;
}
} // namespace rocksdb

@ -0,0 +1,87 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <atomic>
#include <deque>
#include <limits>
#include <set>
#include <utility>
#include <vector>
#include <string>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "db/column_family.h"
#include "db/version_edit.h"
#include "db/memtable_list.h"
#include "port/port.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/transaction_log.h"
#include "util/autovector.h"
#include "util/instrumented_mutex.h"
#include "util/stop_watch.h"
#include "util/thread_local.h"
#include "util/scoped_arena_iterator.h"
#include "db/internal_stats.h"
#include "db/write_controller.h"
#include "db/flush_scheduler.h"
#include "db/write_thread.h"
#include "db/job_context.h"
namespace rocksdb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class Arena;
class FlushJob {
public:
// TODO(icanadi) make effort to reduce number of parameters here
// IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
const DBOptions& db_options,
const MutableCFOptions& mutable_cf_options,
const EnvOptions& env_options, VersionSet* versions,
InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
SequenceNumber newest_snapshot, JobContext* job_context,
LogBuffer* log_buffer, Directory* db_directory,
Directory* output_file_directory, CompressionType output_compression,
Statistics* stats);
~FlushJob() {}
Status Run(uint64_t* file_number = nullptr);
private:
Status WriteLevel0Table(const autovector<MemTable*>& mems, VersionEdit* edit,
uint64_t* filenumber);
const std::string& dbname_;
ColumnFamilyData* cfd_;
const DBOptions& db_options_;
const MutableCFOptions& mutable_cf_options_;
const EnvOptions& env_options_;
VersionSet* versions_;
InstrumentedMutex* db_mutex_;
std::atomic<bool>* shutting_down_;
SequenceNumber newest_snapshot_;
JobContext* job_context_;
LogBuffer* log_buffer_;
Directory* db_directory_;
Directory* output_file_directory_;
CompressionType output_compression_;
Statistics* stats_;
};
} // namespace rocksdb

@ -0,0 +1,124 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <map>
#include <string>
#include "db/flush_job.h"
#include "db/column_family.h"
#include "db/version_set.h"
#include "db/writebuffer.h"
#include "rocksdb/cache.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "table/mock_table.h"
namespace rocksdb {
// TODO(icanadi) Mock out everything else:
// 1. VersionSet
// 2. Memtable
class FlushJobTest {
public:
FlushJobTest()
: env_(Env::Default()),
dbname_(test::TmpDir() + "/flush_job_test"),
table_cache_(NewLRUCache(50000, 16, 8)),
write_buffer_(db_options_.db_write_buffer_size),
versions_(new VersionSet(dbname_, &db_options_, env_options_,
table_cache_.get(), &write_buffer_,
&write_controller_)),
shutting_down_(false),
mock_table_factory_(new mock::MockTableFactory()) {
ASSERT_OK(env_->CreateDirIfMissing(dbname_));
db_options_.db_paths.emplace_back(dbname_,
std::numeric_limits<uint64_t>::max());
// TODO(icanadi) Remove this once we mock out VersionSet
NewDB();
std::vector<ColumnFamilyDescriptor> column_families;
cf_options_.table_factory = mock_table_factory_;
column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
ASSERT_OK(versions_->Recover(column_families, false));
}
void NewDB() {
VersionEdit new_db;
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
const std::string manifest = DescriptorFileName(dbname_, 1);
unique_ptr<WritableFile> file;
Status s = env_->NewWritableFile(
manifest, &file, env_->OptimizeForManifestWrite(env_options_));
ASSERT_OK(s);
{
log::Writer log(std::move(file));
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);
}
ASSERT_OK(s);
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(env_, dbname_, 1, nullptr);
}
Env* env_;
std::string dbname_;
EnvOptions env_options_;
std::shared_ptr<Cache> table_cache_;
WriteController write_controller_;
DBOptions db_options_;
WriteBuffer write_buffer_;
ColumnFamilyOptions cf_options_;
std::unique_ptr<VersionSet> versions_;
InstrumentedMutex mutex_;
std::atomic<bool> shutting_down_;
std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
};
TEST(FlushJobTest, Empty) {
JobContext job_context;
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
db_options_, *cfd->GetLatestMutableCFOptions(),
env_options_, versions_.get(), &mutex_, &shutting_down_,
SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
kNoCompression, nullptr);
ASSERT_OK(flush_job.Run());
job_context.Clean();
}
TEST(FlushJobTest, NonEmpty) {
JobContext job_context;
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions());
new_mem->Ref();
std::map<std::string, std::string> inserted_keys;
for (int i = 1; i < 10000; ++i) {
std::string key(ToString(i));
std::string value("value" + ToString(i));
new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
inserted_keys.insert({internal_key.Encode().ToString(), value});
}
cfd->imm()->Add(new_mem);
FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
db_options_, *cfd->GetLatestMutableCFOptions(),
env_options_, versions_.get(), &mutex_, &shutting_down_,
SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
kNoCompression, nullptr);
mutex_.Lock();
ASSERT_OK(flush_job.Run());
mutex_.Unlock();
mock_table_factory_->AssertSingleFile(inserted_keys);
job_context.Clean();
}
} // namespace rocksdb
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

@ -0,0 +1,63 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/flush_scheduler.h"
#include <cassert>
#include "db/column_family.h"
namespace rocksdb {
void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
#ifndef NDEBUG
assert(column_families_set_.find(cfd) == column_families_set_.end());
column_families_set_.insert(cfd);
#endif // NDEBUG
cfd->Ref();
column_families_.push_back(cfd);
}
ColumnFamilyData* FlushScheduler::GetNextColumnFamily() {
ColumnFamilyData* cfd = nullptr;
while (column_families_.size() > 0) {
cfd = column_families_.front();
column_families_.pop_front();
if (cfd->IsDropped()) {
#ifndef NDEBUG
// The dropped entry never reaches the erase below, so keep the debug
// bookkeeping consistent here.
column_families_set_.erase(cfd);
#endif // NDEBUG
if (cfd->Unref()) {
delete cfd;
}
// Never return a dropped column family, even when other references keep
// the object alive (Unref() returned false).
cfd = nullptr;
} else {
break;
}
}
#ifndef NDEBUG
if (cfd != nullptr) {
auto itr = column_families_set_.find(cfd);
assert(itr != column_families_set_.end());
column_families_set_.erase(itr);
}
#endif // NDEBUG
return cfd;
}
bool FlushScheduler::Empty() { return column_families_.empty(); }
void FlushScheduler::Clear() {
for (auto cfd : column_families_) {
#ifndef NDEBUG
auto itr = column_families_set_.find(cfd);
assert(itr != column_families_set_.end());
column_families_set_.erase(itr);
#endif // NDEBUG
if (cfd->Unref()) {
delete cfd;
}
}
column_families_.clear();
}
} // namespace rocksdb

@ -0,0 +1,40 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <stdint.h>
#include <deque>
#include <set>
#include <vector>
namespace rocksdb {
class ColumnFamilyData;
// This class is thread-compatible. It should only be accessed from a single
// write thread (between BeginWrite() and EndWrite())
class FlushScheduler {
public:
FlushScheduler() = default;
~FlushScheduler() = default;
void ScheduleFlush(ColumnFamilyData* cfd);
// Returns Ref()-ed column family. Client needs to Unref()
// REQUIRES: db mutex is held (exception is single-threaded recovery)
ColumnFamilyData* GetNextColumnFamily();
bool Empty();
void Clear();
private:
std::deque<ColumnFamilyData*> column_families_;
#ifndef NDEBUG
std::set<ColumnFamilyData*> column_families_set_;
#endif // NDEBUG
};
} // namespace rocksdb
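A self-contained toy (editor's illustration; ToyCFD stands in for ColumnFamilyData) of the reference-counting contract documented above: ScheduleFlush() pins the column family with Ref(), and whoever drains the queue via GetNextColumnFamily() must Unref() it, deleting on the last reference:
#include <cassert>
#include <deque>

// ToyCFD stands in for ColumnFamilyData: Ref() pins it, Unref() returns true
// when the last reference is dropped and the caller must delete the object.
struct ToyCFD {
  int refs = 0;
  void Ref() { ++refs; }
  bool Unref() { return --refs == 0; }
};

int main() {
  std::deque<ToyCFD*> queue;          // models FlushScheduler's deque
  ToyCFD* cfd = new ToyCFD;
  cfd->Ref();                         // ScheduleFlush(): take a reference
  queue.push_back(cfd);

  ToyCFD* next = queue.front();       // GetNextColumnFamily()
  queue.pop_front();
  // ... run the flush against `next` ...
  if (next->Unref()) {                // caller's obligation once done
    delete next;
  }
  assert(queue.empty());              // Empty() would now return true
  return 0;
}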

@ -10,6 +10,7 @@
#include <string>
#include <utility>
#include "db/job_context.h"
#include "db/db_impl.h"
#include "db/db_iter.h"
#include "db/column_family.h"
@ -114,27 +115,36 @@ class LevelIterator : public Iterator {
};
ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
ColumnFamilyData* cfd)
ColumnFamilyData* cfd, SuperVersion* current_sv)
: db_(db),
read_options_(read_options),
cfd_(cfd),
prefix_extractor_(cfd->options()->prefix_extractor.get()),
prefix_extractor_(cfd->ioptions()->prefix_extractor),
user_comparator_(cfd->user_comparator()),
immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
sv_(nullptr),
sv_(current_sv),
mutable_iter_(nullptr),
current_(nullptr),
status_(Status::OK()),
immutable_status_(Status::OK()),
valid_(false),
is_prev_set_(false) {}
is_prev_set_(false),
is_prev_inclusive_(false) {
if (sv_) {
RebuildIterators(false);
}
}
ForwardIterator::~ForwardIterator() {
Cleanup();
Cleanup(true);
}
void ForwardIterator::Cleanup() {
delete mutable_iter_;
void ForwardIterator::Cleanup(bool release_sv) {
if (mutable_iter_ != nullptr) {
mutable_iter_->~Iterator();
}
for (auto* m : imm_iters_) {
delete m;
m->~Iterator();
}
imm_iters_.clear();
for (auto* f : l0_iters_) {
@ -146,15 +156,17 @@ void ForwardIterator::Cleanup() {
}
level_iters_.clear();
if (sv_ != nullptr && sv_->Unref()) {
DBImpl::DeletionState deletion_state;
db_->mutex_.Lock();
sv_->Cleanup();
db_->FindObsoleteFiles(deletion_state, false, true);
db_->mutex_.Unlock();
delete sv_;
if (deletion_state.HaveSomethingToDelete()) {
db_->PurgeObsoleteFiles(deletion_state);
if (release_sv) {
if (sv_ != nullptr && sv_->Unref()) {
JobContext job_context;
db_->mutex_.Lock();
sv_->Cleanup();
db_->FindObsoleteFiles(&job_context, false, true);
db_->mutex_.Unlock();
delete sv_;
if (job_context.HaveSomethingToDelete()) {
db_->PurgeObsoleteFiles(job_context);
}
}
}
}
@ -166,8 +178,8 @@ bool ForwardIterator::Valid() const {
void ForwardIterator::SeekToFirst() {
if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators();
} else if (status_.IsIncomplete()) {
RebuildIterators(true);
} else if (immutable_status_.IsIncomplete()) {
ResetIncompleteIterators();
}
SeekInternal(Slice(), true);
@ -176,8 +188,8 @@ void ForwardIterator::SeekToFirst() {
void ForwardIterator::Seek(const Slice& internal_key) {
if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators();
} else if (status_.IsIncomplete()) {
RebuildIterators(true);
} else if (immutable_status_.IsIncomplete()) {
ResetIncompleteIterators();
}
SeekInternal(internal_key, false);
@ -185,6 +197,7 @@ void ForwardIterator::Seek(const Slice& internal_key) {
void ForwardIterator::SeekInternal(const Slice& internal_key,
bool seek_to_first) {
assert(mutable_iter_);
// mutable
seek_to_first ? mutable_iter_->SeekToFirst() :
mutable_iter_->Seek(internal_key);
@ -194,13 +207,16 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
// if it turns to need to seek immutable often. We probably want to have
// an option to turn it off.
if (seek_to_first || NeedToSeekImmutable(internal_key)) {
immutable_status_ = Status::OK();
{
auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
immutable_min_heap_.swap(tmp);
}
for (auto* m : imm_iters_) {
seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
if (m->Valid()) {
if (!m->status().ok()) {
immutable_status_ = m->status();
} else if (m->Valid()) {
immutable_min_heap_.push(m);
}
}
@ -209,27 +225,23 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
if (!seek_to_first) {
user_key = ExtractUserKey(internal_key);
}
auto* files = sv_->current->files_;
for (uint32_t i = 0; i < files[0].size(); ++i) {
const VersionStorageInfo* vstorage = sv_->current->storage_info();
const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
for (uint32_t i = 0; i < l0.size(); ++i) {
if (seek_to_first) {
l0_iters_[i]->SeekToFirst();
} else {
// If the target key passes over the largest key, we are sure Next()
// won't go over this file.
if (user_comparator_->Compare(user_key,
files[0][i]->largest.user_key()) > 0) {
l0[i]->largest.user_key()) > 0) {
continue;
}
l0_iters_[i]->Seek(internal_key);
}
if (l0_iters_[i]->status().IsIncomplete()) {
// if any of the immutable iterators is incomplete (no-io option was
// used), we are unable to reliably find the smallest key
assert(read_options_.read_tier == kBlockCacheTier);
status_ = l0_iters_[i]->status();
valid_ = false;
return;
if (!l0_iters_[i]->status().ok()) {
immutable_status_ = l0_iters_[i]->status();
} else if (l0_iters_[i]->Valid()) {
immutable_min_heap_.push(l0_iters_[i]);
}
@ -237,86 +249,83 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
int32_t search_left_bound = 0;
int32_t search_right_bound = FileIndexer::kLevelMaxIndex;
for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
if (files[level].empty()) {
for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
const std::vector<FileMetaData*>& level_files =
vstorage->LevelFiles(level);
if (level_files.empty()) {
search_left_bound = 0;
search_right_bound = FileIndexer::kLevelMaxIndex;
continue;
}
assert(level_iters_[level - 1] != nullptr);
uint32_t f_idx = 0;
const auto& indexer = vstorage->file_indexer();
if (!seek_to_first) {
// TODO(ljin): remove before committing
// f_idx = FindFileInRange(
// files[level], internal_key, 0, files[level].size());
if (search_left_bound == search_right_bound) {
f_idx = search_left_bound;
} else if (search_left_bound < search_right_bound) {
f_idx = FindFileInRange(
files[level], internal_key, search_left_bound,
search_right_bound == FileIndexer::kLevelMaxIndex ?
files[level].size() : search_right_bound);
f_idx =
FindFileInRange(level_files, internal_key, search_left_bound,
search_right_bound == FileIndexer::kLevelMaxIndex
? static_cast<uint32_t>(level_files.size())
: search_right_bound);
} else {
// search_left_bound > search_right_bound
// There are only two cases in which this can happen:
// (1) the target key is smaller than the leftmost file
// (2) the target key is larger than the rightmost file
assert(search_left_bound == (int32_t)files[level].size() ||
assert(search_left_bound == (int32_t)level_files.size() ||
search_right_bound == -1);
if (search_right_bound == -1) {
assert(search_left_bound == 0);
f_idx = 0;
} else {
sv_->current->file_indexer_.GetNextLevelIndex(
level, files[level].size() - 1,
indexer.GetNextLevelIndex(
level, level_files.size() - 1,
1, 1, &search_left_bound, &search_right_bound);
continue;
}
}
// Prepare hints for the next level
if (f_idx < files[level].size()) {
if (f_idx < level_files.size()) {
int cmp_smallest = user_comparator_->Compare(
user_key, files[level][f_idx]->smallest.user_key());
user_key, level_files[f_idx]->smallest.user_key());
int cmp_largest = -1;
if (cmp_smallest >= 0) {
cmp_largest = user_comparator_->Compare(
user_key, files[level][f_idx]->largest.user_key());
user_key, level_files[f_idx]->largest.user_key());
}
sv_->current->file_indexer_.GetNextLevelIndex(level, f_idx,
indexer.GetNextLevelIndex(level, f_idx,
cmp_smallest, cmp_largest,
&search_left_bound, &search_right_bound);
} else {
sv_->current->file_indexer_.GetNextLevelIndex(
level, files[level].size() - 1,
indexer.GetNextLevelIndex(
level, level_files.size() - 1,
1, 1, &search_left_bound, &search_right_bound);
}
}
// Seek
if (f_idx < files[level].size()) {
if (f_idx < level_files.size()) {
level_iters_[level - 1]->SetFileIndex(f_idx);
seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
level_iters_[level - 1]->Seek(internal_key);
if (level_iters_[level - 1]->status().IsIncomplete()) {
// see above
assert(read_options_.read_tier == kBlockCacheTier);
status_ = level_iters_[level - 1]->status();
valid_ = false;
return;
if (!level_iters_[level - 1]->status().ok()) {
immutable_status_ = level_iters_[level - 1]->status();
} else if (level_iters_[level - 1]->Valid()) {
immutable_min_heap_.push(level_iters_[level - 1]);
}
}
}
if (seek_to_first || immutable_min_heap_.empty()) {
if (seek_to_first) {
is_prev_set_ = false;
} else {
prev_key_.SetKey(internal_key);
is_prev_set_ = true;
is_prev_inclusive_ = true;
}
} else if (current_ && current_ != mutable_iter_) {
// current_ is one of immutable iterators, push it back to the heap
@ -334,24 +343,33 @@ void ForwardIterator::Next() {
std::string current_key = key().ToString();
Slice old_key(current_key.data(), current_key.size());
RebuildIterators();
RebuildIterators(true);
SeekInternal(old_key, false);
if (!valid_ || key().compare(old_key) != 0) {
return;
}
} else if (current_ != mutable_iter_) {
// It is going to advance immutable iterator
prev_key_.SetKey(current_->key());
is_prev_set_ = true;
bool update_prev_key = true;
if (is_prev_set_ && prefix_extractor_) {
// advance prev_key_ to current_ only if they share the same prefix
update_prev_key =
prefix_extractor_->Transform(prev_key_.GetKey()).compare(
prefix_extractor_->Transform(current_->key())) == 0;
}
if (update_prev_key) {
prev_key_.SetKey(current_->key());
is_prev_set_ = true;
is_prev_inclusive_ = false;
}
}
current_->Next();
if (current_ != mutable_iter_) {
if (current_->status().IsIncomplete()) {
assert(read_options_.read_tier == kBlockCacheTier);
status_ = current_->status();
valid_ = false;
return;
if (!current_->status().ok()) {
immutable_status_ = current_->status();
} else if (current_->Valid()) {
immutable_min_heap_.push(current_);
}
@ -377,45 +395,35 @@ Status ForwardIterator::status() const {
return mutable_iter_->status();
}
for (auto *it : imm_iters_) {
if (it && !it->status().ok()) {
return it->status();
}
}
for (auto *it : l0_iters_) {
if (it && !it->status().ok()) {
return it->status();
}
}
for (auto *it : level_iters_) {
if (it && !it->status().ok()) {
return it->status();
}
}
return Status::OK();
return immutable_status_;
}
void ForwardIterator::RebuildIterators() {
void ForwardIterator::RebuildIterators(bool refresh_sv) {
// Clean up
Cleanup();
// New
sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
mutable_iter_ = sv_->mem->NewIterator(read_options_);
sv_->imm->AddIterators(read_options_, &imm_iters_);
const auto& l0_files = sv_->current->files_[0];
Cleanup(refresh_sv);
if (refresh_sv) {
// New
sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
}
mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
const auto* vstorage = sv_->current->storage_info();
const auto& l0_files = vstorage->LevelFiles(0);
l0_iters_.reserve(l0_files.size());
for (const auto* l0 : l0_files) {
l0_iters_.push_back(cfd_->table_cache()->NewIterator(
read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd));
}
level_iters_.reserve(sv_->current->NumberLevels() - 1);
for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
if (sv_->current->files_[level].empty()) {
level_iters_.reserve(vstorage->num_levels() - 1);
for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
const auto& level_files = vstorage->LevelFiles(level);
if (level_files.empty()) {
level_iters_.push_back(nullptr);
} else {
level_iters_.push_back(new LevelIterator(cfd_, read_options_,
sv_->current->files_[level]));
level_iters_.push_back(
new LevelIterator(cfd_, read_options_, level_files));
}
}
@ -424,7 +432,7 @@ void ForwardIterator::RebuildIterators() {
}
void ForwardIterator::ResetIncompleteIterators() {
const auto& l0_files = sv_->current->files_[0];
const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
for (uint32_t i = 0; i < l0_iters_.size(); ++i) {
assert(i < l0_files.size());
if (!l0_iters_[i]->status().IsIncomplete()) {
@ -474,7 +482,14 @@ void ForwardIterator::UpdateCurrent() {
}
bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
if (!valid_ || !is_prev_set_) {
// We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
// such that there are no records with keys within that range in
// immutable_min_heap_. Since immutable structures (SST files and immutable
// memtables) can't change in this version, we don't need to do a seek if
// 'target' belongs to that interval (immutable_min_heap_.top() is already
// at the correct position).
if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
return true;
}
Slice prev_key = prev_key_.GetKey();
@ -483,13 +498,17 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
return true;
}
if (cfd_->internal_comparator().InternalKeyComparator::Compare(
prev_key, target) >= 0) {
prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
return true;
}
if (immutable_min_heap_.empty() ||
cfd_->internal_comparator().InternalKeyComparator::Compare(
target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
: current_->key()) > 0) {
if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
// Nothing to seek on.
return false;
}
if (cfd_->internal_comparator().InternalKeyComparator::Compare(
target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
: current_->key()) > 0) {
return true;
}
return false;
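A simplified, self-contained model (editor's sketch; plain strings stand in for internal keys, and the mutable-iterator special cases are omitted) of the interval test NeedToSeekImmutable performs with the new is_prev_inclusive_ flag:
#include <cassert>
#include <string>

// Returns true when the immutable iterators must be re-seeked for `target`.
// `prev` marks the left end of the known-empty interval (exclusive unless
// prev_inclusive); `heap_top` is the smallest key the immutable heap holds.
static bool NeedSeek(const std::string& prev, bool prev_inclusive,
                     const std::string& heap_top, const std::string& target) {
  if (prev.compare(target) >= (prev_inclusive ? 1 : 0)) {
    return true;  // target is not strictly inside the covered interval
  }
  return heap_top.compare(target) < 0;  // target lies beyond the heap top
}

int main() {
  assert(!NeedSeek("k3", false, "k7", "k5"));  // inside (k3, k7]: reuse
  assert(!NeedSeek("k3", true, "k7", "k3"));   // inclusive left end: reuse
  assert(NeedSeek("k3", false, "k7", "k3"));   // exclusive left end: re-seek
  assert(NeedSeek("k3", false, "k7", "k9"));   // past heap top: re-seek
  return 0;
}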

@ -14,6 +14,7 @@
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "db/dbformat.h"
#include "util/arena.h"
namespace rocksdb {
@ -50,7 +51,7 @@ typedef std::priority_queue<Iterator*,
class ForwardIterator : public Iterator {
public:
ForwardIterator(DBImpl* db, const ReadOptions& read_options,
ColumnFamilyData* cfd);
ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr);
virtual ~ForwardIterator();
void SeekToLast() override {
@ -71,8 +72,8 @@ class ForwardIterator : public Iterator {
virtual Status status() const override;
private:
void Cleanup();
void RebuildIterators();
void Cleanup(bool release_sv);
void RebuildIterators(bool refresh_sv);
void ResetIncompleteIterators();
void SeekInternal(const Slice& internal_key, bool seek_to_first);
void UpdateCurrent();
@ -96,10 +97,13 @@ class ForwardIterator : public Iterator {
Iterator* current_;
// internal iterator status
Status status_;
Status immutable_status_;
bool valid_;
IterKey prev_key_;
bool is_prev_set_;
bool is_prev_inclusive_;
Arena arena_;
};
} // namespace rocksdb

@ -7,14 +7,21 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/internal_stats.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <vector>
#include "db/column_family.h"
#include "db/db_impl.h"
#include "util/string_util.h"
namespace rocksdb {
#ifndef ROCKSDB_LITE
namespace {
const double kMB = 1048576.0;
const double kGB = kMB * 1024;
@ -24,64 +31,55 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) {
buf, len,
"\n** Compaction Stats [%s] **\n"
"Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) "
"Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) "
"Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) "
"Stall(sec) Stall(cnt) Avg(ms)\n"
"Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) "
"Comp(sec) Comp(cnt) Avg(sec) "
"Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop\n"
"--------------------------------------------------------------------"
"--------------------------------------------------------------------"
"--------------------------------------------------------------------\n",
"----------------------------------------------------------\n",
cf_name.c_str());
}
void PrintLevelStats(char* buf, size_t len, const std::string& name,
int num_files, int being_compacted, double total_file_size, double score,
double rw_amp, double w_amp, double stall_us, uint64_t stalls,
double w_amp, double stall_us, uint64_t stalls,
const InternalStats::CompactionStats& stats) {
uint64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1;
uint64_t bytes_new = stats.bytes_written - stats.bytes_readnp1;
double elapsed = (stats.micros + 1) / 1000000.0;
snprintf(buf, len,
"%4s %5d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */
"%8.1f " /* Read(GB) */
"%7.1f " /* Rn(GB) */
"%8.1f " /* Rnp1(GB) */
"%9.1f " /* Write(GB) */
"%8.1f " /* Wnew(GB) */
"%6.1f " /* RW-Amp */
"%5.1f " /* W-Amp */
"%8.1f " /* Rd(MB/s) */
"%8.1f " /* Wr(MB/s) */
"%8d " /* Rn(cnt) */
"%9d " /* Rnp1(cnt) */
"%9d " /* Wnp1(cnt) */
"%9d " /* Wnew(cnt) */
"%10.0f " /* Comp(sec) */
"%9d " /* Comp(cnt) */
"%8.3f " /* Avg(sec) */
"%10.2f " /* Stall(sec) */
"%10" PRIu64 " " /* Stall(cnt) */
"%7.2f\n" /* Avg(ms) */,
name.c_str(), num_files, being_compacted, total_file_size / kMB, score,
bytes_read / kGB,
stats.bytes_readn / kGB,
stats.bytes_readnp1 / kGB,
stats.bytes_written / kGB,
bytes_new / kGB,
rw_amp,
w_amp,
bytes_read / kMB / elapsed,
stats.bytes_written / kMB / elapsed,
stats.files_in_leveln,
stats.files_in_levelnp1,
stats.files_out_levelnp1,
stats.files_out_levelnp1 - stats.files_in_levelnp1,
stats.micros / 1000000.0,
stats.count,
stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count,
stall_us / 1000000.0,
stalls,
stalls == 0 ? 0 : stall_us / 1000.0 / stalls);
"%4s %5d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */
"%8.1f " /* Read(GB) */
"%7.1f " /* Rn(GB) */
"%8.1f " /* Rnp1(GB) */
"%9.1f " /* Write(GB) */
"%8.1f " /* Wnew(GB) */
"%9.1f " /* Moved(GB) */
"%5.1f " /* W-Amp */
"%8.1f " /* Rd(MB/s) */
"%8.1f " /* Wr(MB/s) */
"%9.0f " /* Comp(sec) */
"%9d " /* Comp(cnt) */
"%8.3f " /* Avg(sec) */
"%10.2f " /* Stall(sec) */
"%10" PRIu64
" " /* Stall(cnt) */
"%7.2f " /* Avg(ms) */
"%12" PRIu64
" " /* input entries */
"%12" PRIu64 "\n" /* number of records reduced */,
name.c_str(), num_files, being_compacted, total_file_size / kMB,
score, bytes_read / kGB, stats.bytes_readn / kGB,
stats.bytes_readnp1 / kGB, stats.bytes_written / kGB,
bytes_new / kGB, stats.bytes_moved / kGB,
w_amp, bytes_read / kMB / elapsed,
stats.bytes_written / kMB / elapsed,
stats.micros / 1000000.0, stats.count,
stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count,
stall_us / 1000000.0, stalls,
stalls == 0 ? 0 : stall_us / 1000.0 / stalls,
stats.num_input_records, stats.num_dropped_records);
}
@ -125,6 +123,8 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
return kBackgroundErrors;
} else if (in == "cur-size-active-mem-table") {
return kCurSizeActiveMemTable;
} else if (in == "cur-size-all-mem-tables") {
return kCurSizeAllMemTables;
} else if (in == "num-entries-active-mem-table") {
return kNumEntriesInMutableMemtable;
} else if (in == "num-entries-imm-mem-tables") {
@ -136,6 +136,10 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
return kEstimatedUsageByTableReaders;
} else if (in == "is-file-deletions-enabled") {
return kIsFileDeletionEnabled;
} else if (in == "num-snapshots") {
return kNumSnapshots;
} else if (in == "oldest-snapshot-time") {
return kOldestSnapshotTime;
}
return kUnknown;
}
@ -159,7 +163,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
const Slice& property,
std::string* value) {
assert(value != nullptr);
Version* current = cfd_->current();
auto* current = cfd_->current();
const auto* vstorage = current->storage_info();
Slice in = property;
switch (property_type) {
@ -172,7 +177,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
} else {
char buf[100];
snprintf(buf, sizeof(buf), "%d",
current->NumLevelFiles(static_cast<int>(level)));
vstorage->NumLevelFiles(static_cast<int>(level)));
*value = buf;
return true;
}
@ -186,8 +191,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
for (int level = 0; level < number_levels_; level++) {
snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
current->NumLevelFiles(level),
current->NumLevelBytes(level) / kMB);
vstorage->NumLevelFiles(level),
vstorage->NumLevelBytes(level) / kMB);
value->append(buf);
}
return true;
@ -219,7 +224,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
bool InternalStats::GetIntProperty(DBPropertyType property_type,
uint64_t* value, DBImpl* db) const {
Version* current = cfd_->current();
const auto* vstorage = cfd_->current()->storage_info();
switch (property_type) {
case kNumImmutableMemTable:
@ -232,7 +237,7 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
case kCompactionPending:
// 1 if the system has already determined that at least one compaction is
// needed; 0 otherwise.
*value = (current->NeedsCompaction() ? 1 : 0);
*value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
return true;
case kBackgroundErrors:
// Accumulated number of errors in background flushes or compactions.
@ -242,12 +247,17 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
// Current size of the active memtable
*value = cfd_->mem()->ApproximateMemoryUsage();
return true;
case kCurSizeAllMemTables:
// Current size of the active memtable + immutable memtables
*value = cfd_->mem()->ApproximateMemoryUsage() +
cfd_->imm()->ApproximateMemoryUsage();
return true;
case kNumEntriesInMutableMemtable:
// Current size of the active memtable
// Current number of entries in the active memtable
*value = cfd_->mem()->GetNumEntries();
return true;
case kNumEntriesInImmutableMemtable:
// Current size of the active memtable
// Current number of entries in the immutable memtables
*value = cfd_->imm()->current()->GetTotalNumEntries();
return true;
case kEstimatedNumKeys:
@ -255,11 +265,19 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
// Use estimated entries in tables + total entries in memtables.
*value = cfd_->mem()->GetNumEntries() +
cfd_->imm()->current()->GetTotalNumEntries() +
current->GetEstimatedActiveKeys();
vstorage->GetEstimatedActiveKeys();
return true;
case kNumSnapshots:
*value = db->snapshots().count();
return true;
case kOldestSnapshotTime:
*value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
return true;
#ifndef ROCKSDB_LITE
case kIsFileDeletionEnabled:
*value = db->IsFileDeletionsEnabled();
return true;
#endif
default:
return false;
}
@ -276,18 +294,29 @@ void InternalStats::DumpDBStats(std::string* value) {
value->append(buf);
// Cumulative
uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN];
uint64_t num_keys_written = db_stats_[InternalStats::NUMBER_KEYS_WRITTEN];
uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER];
uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF];
uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES];
uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED];
uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL];
uint64_t write_stall_micros = db_stats_[InternalStats::WRITE_STALL_MICROS];
// Data
// writes: total number of write requests.
// keys: total number of key updates issued by all the write requests
// batches: number of group commits issued to the DB. Each group can contain
// one or more writes.
// so keys/writes is the average number of puts per write request, and
// writes/batches is the average group commit size.
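// Example with purely hypothetical numbers: 1200 writes of which 900 were
// group leaders (write_self) gives 1200/900 ~= 1.3 writes per batch; with
// 6000 keys, each write carried 5 key updates on average.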
//
// The format is the same for interval stats.
snprintf(buf, sizeof(buf),
"Cumulative writes: %" PRIu64 " writes, %" PRIu64 " batches, "
"%.1f writes per batch, %.2f GB user ingest\n",
write_other + write_self, write_self,
"Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
" batches, %.1f writes per batch, %.2f GB user ingest, "
"stall micros: %" PRIu64 "\n",
write_other + write_self, num_keys_written, write_self,
(write_other + write_self) / static_cast<double>(write_self + 1),
user_bytes_written / kGB);
user_bytes_written / kGB, write_stall_micros);
value->append(buf);
// WAL
snprintf(buf, sizeof(buf),
@ -301,14 +330,18 @@ void InternalStats::DumpDBStats(std::string* value) {
// Interval
uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
uint64_t interval_num_keys_written =
num_keys_written - db_stats_snapshot_.num_keys_written;
snprintf(buf, sizeof(buf),
"Interval writes: %" PRIu64 " writes, %" PRIu64 " batches, "
"%.1f writes per batch, %.1f MB user ingest\n",
"Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
" batches, %.1f writes per batch, %.1f MB user ingest, "
"stall micros: %" PRIu64 "\n",
interval_write_other + interval_write_self,
interval_write_self,
interval_num_keys_written, interval_write_self,
static_cast<double>(interval_write_other + interval_write_self) /
(interval_write_self + 1),
(user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB);
(user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
write_stall_micros - db_stats_snapshot_.write_stall_micros);
value->append(buf);
uint64_t interval_write_with_wal =
@ -330,30 +363,33 @@ void InternalStats::DumpDBStats(std::string* value) {
db_stats_snapshot_.ingest_bytes = user_bytes_written;
db_stats_snapshot_.write_other = write_other;
db_stats_snapshot_.write_self = write_self;
db_stats_snapshot_.num_keys_written = num_keys_written;
db_stats_snapshot_.wal_bytes = wal_bytes;
db_stats_snapshot_.wal_synced = wal_synced;
db_stats_snapshot_.write_with_wal = write_with_wal;
db_stats_snapshot_.write_stall_micros = write_stall_micros;
}
void InternalStats::DumpCFStats(std::string* value) {
Version* current = cfd_->current();
const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
int num_levels_to_check =
(cfd_->options()->compaction_style != kCompactionStyleUniversal &&
cfd_->options()->compaction_style != kCompactionStyleFIFO)
? current->NumberLevels() - 1
(cfd_->ioptions()->compaction_style != kCompactionStyleUniversal &&
cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
? vstorage->num_levels() - 1
: 1;
// Compaction scores are sorted base on its value. Restore them to the
// level order
std::vector<double> compaction_score(number_levels_, 0);
for (int i = 0; i < num_levels_to_check; ++i) {
compaction_score[current->compaction_level_[i]] =
current->compaction_score_[i];
compaction_score[vstorage->CompactionScoreLevel(i)] =
vstorage->CompactionScore(i);
}
// Count # of files being compacted for each level
std::vector<int> files_being_compacted(number_levels_, 0);
for (int level = 0; level < num_levels_to_check; ++level) {
for (auto* f : current->files_[level]) {
for (auto* f : vstorage->LevelFiles(level)) {
if (f->being_compacted) {
++files_being_compacted[level];
}
@ -376,7 +412,7 @@ void InternalStats::DumpCFStats(std::string* value) {
uint64_t total_stall_count = 0;
double total_stall_us = 0;
for (int level = 0; level < number_levels_; level++) {
int files = current->NumLevelFiles(level);
int files = vstorage->NumLevelFiles(level);
total_files += files;
total_files_being_compacted += files_being_compacted[level];
if (comp_stats_[level].micros > 0 || files > 0) {
@ -395,36 +431,29 @@ void InternalStats::DumpCFStats(std::string* value) {
stall_leveln_slowdown_hard_[level]);
stats_sum.Add(comp_stats_[level]);
total_file_size += current->NumLevelBytes(level);
total_file_size += vstorage->NumLevelBytes(level);
total_stall_us += stall_us;
total_stall_count += stalls;
total_slowdown_soft += stall_leveln_slowdown_soft_[level];
total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level];
total_slowdown_hard += stall_leveln_slowdown_hard_[level];
total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level];
int64_t bytes_read = comp_stats_[level].bytes_readn +
comp_stats_[level].bytes_readnp1;
double rw_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0
: (comp_stats_[level].bytes_written + bytes_read) /
static_cast<double>(comp_stats_[level].bytes_readn);
double w_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0
: comp_stats_[level].bytes_written /
static_cast<double>(comp_stats_[level].bytes_readn);
PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level),
files, files_being_compacted[level], current->NumLevelBytes(level),
compaction_score[level], rw_amp, w_amp, stall_us, stalls,
comp_stats_[level]);
PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files,
files_being_compacted[level],
vstorage->NumLevelBytes(level), compaction_score[level],
w_amp, stall_us, stalls, comp_stats_[level]);
value->append(buf);
}
}
uint64_t curr_ingest = cf_stats_value_[BYTES_FLUSHED];
// Cumulative summary
double rw_amp = (stats_sum.bytes_written + stats_sum.bytes_readn +
stats_sum.bytes_readnp1) / static_cast<double>(curr_ingest + 1);
double w_amp = stats_sum.bytes_written / static_cast<double>(curr_ingest + 1);
// Stats summary across levels
PrintLevelStats(buf, sizeof(buf), "Sum", total_files,
total_files_being_compacted, total_file_size, 0, rw_amp, w_amp,
total_files_being_compacted, total_file_size, 0, w_amp,
total_stall_us, total_stall_count, stats_sum);
value->append(buf);
// Interval summary
@ -432,12 +461,9 @@ void InternalStats::DumpCFStats(std::string* value) {
curr_ingest - cf_stats_snapshot_.ingest_bytes + 1;
CompactionStats interval_stats(stats_sum);
interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
rw_amp = (interval_stats.bytes_written +
interval_stats.bytes_readn + interval_stats.bytes_readnp1) /
static_cast<double>(interval_ingest);
w_amp = interval_stats.bytes_written / static_cast<double>(interval_ingest);
PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0,
rw_amp, w_amp, total_stall_us - cf_stats_snapshot_.stall_us,
w_amp, total_stall_us - cf_stats_snapshot_.stall_us,
total_stall_count - cf_stats_snapshot_.stall_count, interval_stats);
value->append(buf);
@ -473,4 +499,14 @@ void InternalStats::DumpCFStats(std::string* value) {
cf_stats_snapshot_.stall_count = total_stall_count;
}
#else
DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
bool* need_out_of_mutex) {
return kUnknown;
}
#endif // !ROCKSDB_LITE
} // namespace rocksdb
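For the W-Amp column computed above, a tiny standalone sketch (editor's illustration with made-up byte counts) of the bytes_written / ingest formula used for the "Sum" row:
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical counters: 10 GB flushed into L0, 35 GB written by all
  // compactions combined (the stats_sum totals above).
  uint64_t curr_ingest = 10ULL << 30;
  uint64_t bytes_written = 35ULL << 30;
  // Same shape as DumpCFStats; the +1 avoids division by zero on a fresh DB.
  double w_amp = bytes_written / static_cast<double>(curr_ingest + 1);
  std::printf("W-Amp: %.1f\n", w_amp);  // ~3.5
  return 0;
}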

@ -21,6 +21,8 @@ namespace rocksdb {
class MemTableList;
class DBImpl;
// IMPORTANT: If you add a new property here, also add it to the list in
// include/rocksdb/db.h
enum DBPropertyType : uint32_t {
kUnknown,
kNumFilesAtLevel, // Number of files at a specific level
@ -36,6 +38,8 @@ enum DBPropertyType : uint32_t {
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
kBackgroundErrors, // Return accumulated background errors encountered.
kCurSizeActiveMemTable, // Return current size of the active memtable
kCurSizeAllMemTables, // Return current size of all (active + immutable)
// memtables
kNumEntriesInMutableMemtable, // Return number of entries in the mutable
// memtable.
kNumEntriesInImmutableMemtable, // Return sum of number of entries in all
@ -44,12 +48,16 @@ enum DBPropertyType : uint32_t {
kEstimatedUsageByTableReaders, // Estimated memory by table readers.
kIsFileDeletionEnabled, // Equals disable_delete_obsolete_files_,
// 0 means file deletions enabled
kNumSnapshots, // Number of snapshots in the system
kOldestSnapshotTime, // Unix timestamp of the first snapshot
};
extern DBPropertyType GetPropertyType(const Slice& property,
bool* is_int_property,
bool* need_out_of_mutex);
#ifndef ROCKSDB_LITE
class InternalStats {
public:
enum InternalCFStatsType {
@ -65,9 +73,11 @@ class InternalStats {
WAL_FILE_BYTES,
WAL_FILE_SYNCED,
BYTES_WRITTEN,
NUMBER_KEYS_WRITTEN,
WRITE_DONE_BY_OTHER,
WRITE_DONE_BY_SELF,
WRITE_WITH_WAL,
WRITE_STALL_MICROS,
INTERNAL_DB_STATS_ENUM_MAX,
};
@ -114,6 +124,9 @@ class InternalStats {
// Total bytes written during compaction between levels N and N+1
uint64_t bytes_written;
// Total bytes moved to this level
uint64_t bytes_moved;
// Files read from level N during compaction between levels N and N+1
int files_in_leveln;
@ -123,27 +136,40 @@ class InternalStats {
// Files written during compaction between levels N and N+1
int files_out_levelnp1;
// Total incoming entries during compaction between levels N and N+1
uint64_t num_input_records;
// Accumulated diff number of entries
// (num input entries - num output entries) for compaction levels N and N+1
uint64_t num_dropped_records;
// Number of compactions done
int count;
explicit CompactionStats(int count = 0)
explicit CompactionStats(int _count = 0)
: micros(0),
bytes_readn(0),
bytes_readnp1(0),
bytes_written(0),
bytes_moved(0),
files_in_leveln(0),
files_in_levelnp1(0),
files_out_levelnp1(0),
count(count) {}
num_input_records(0),
num_dropped_records(0),
count(_count) {}
explicit CompactionStats(const CompactionStats& c)
: micros(c.micros),
bytes_readn(c.bytes_readn),
bytes_readnp1(c.bytes_readnp1),
bytes_written(c.bytes_written),
bytes_moved(c.bytes_moved),
files_in_leveln(c.files_in_leveln),
files_in_levelnp1(c.files_in_levelnp1),
files_out_levelnp1(c.files_out_levelnp1),
num_input_records(c.num_input_records),
num_dropped_records(c.num_dropped_records),
count(c.count) {}
void Add(const CompactionStats& c) {
@ -151,9 +177,12 @@ class InternalStats {
this->bytes_readn += c.bytes_readn;
this->bytes_readnp1 += c.bytes_readnp1;
this->bytes_written += c.bytes_written;
this->bytes_moved += c.bytes_moved;
this->files_in_leveln += c.files_in_leveln;
this->files_in_levelnp1 += c.files_in_levelnp1;
this->files_out_levelnp1 += c.files_out_levelnp1;
this->num_input_records += c.num_input_records;
this->num_dropped_records += c.num_dropped_records;
this->count += c.count;
}
@ -162,9 +191,12 @@ class InternalStats {
this->bytes_readn -= c.bytes_readn;
this->bytes_readnp1 -= c.bytes_readnp1;
this->bytes_written -= c.bytes_written;
this->bytes_moved -= c.bytes_moved;
this->files_in_leveln -= c.files_in_leveln;
this->files_in_levelnp1 -= c.files_in_levelnp1;
this->files_out_levelnp1 -= c.files_out_levelnp1;
this->num_input_records -= c.num_input_records;
this->num_dropped_records -= c.num_dropped_records;
this->count -= c.count;
}
};
@ -173,6 +205,10 @@ class InternalStats {
comp_stats_[level].Add(stats);
}
void IncBytesMoved(int level, uint64_t amount) {
comp_stats_[level].bytes_moved += amount;
}
void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) {
if (soft) {
stall_leveln_slowdown_soft_[level] += micros;
@ -247,6 +283,13 @@ class InternalStats {
// another thread.
uint64_t write_other;
uint64_t write_self;
// Total number of keys written. write_self and write_other measure the
// number of write requests written. Each write request can contain updates
// to multiple keys. num_keys_written is the total number of keys updated by
// all those writes.
uint64_t num_keys_written;
// Total time writes delayed by stalls.
uint64_t write_stall_micros;
double seconds_up;
DBStatsSnapshot()
@ -256,6 +299,8 @@ class InternalStats {
write_with_wal(0),
write_other(0),
write_self(0),
num_keys_written(0),
write_stall_micros(0),
seconds_up(0) {}
} db_stats_snapshot_;
@ -272,4 +317,78 @@ class InternalStats {
const uint64_t started_at_;
};
#else
class InternalStats {
public:
enum InternalCFStatsType {
LEVEL0_SLOWDOWN,
MEMTABLE_COMPACTION,
LEVEL0_NUM_FILES,
WRITE_STALLS_ENUM_MAX,
BYTES_FLUSHED,
INTERNAL_CF_STATS_ENUM_MAX,
};
enum InternalDBStatsType {
WAL_FILE_BYTES,
WAL_FILE_SYNCED,
BYTES_WRITTEN,
NUMBER_KEYS_WRITTEN,
WRITE_DONE_BY_OTHER,
WRITE_DONE_BY_SELF,
WRITE_WITH_WAL,
WRITE_STALL_MICROS,
INTERNAL_DB_STATS_ENUM_MAX,
};
InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) {}
struct CompactionStats {
uint64_t micros;
uint64_t bytes_readn;
uint64_t bytes_readnp1;
uint64_t bytes_written;
uint64_t bytes_moved;
int files_in_leveln;
int files_in_levelnp1;
int files_out_levelnp1;
uint64_t num_input_records;
uint64_t num_dropped_records;
int count;
explicit CompactionStats(int _count = 0) {}
explicit CompactionStats(const CompactionStats& c) {}
void Add(const CompactionStats& c) {}
void Subtract(const CompactionStats& c) {}
};
void AddCompactionStats(int level, const CompactionStats& stats) {}
void IncBytesMoved(int level, uint64_t amount) {}
void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) {}
void AddCFStats(InternalCFStatsType type, uint64_t value) {}
void AddDBStats(InternalDBStatsType type, uint64_t value) {}
uint64_t GetBackgroundErrorCount() const { return 0; }
uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
bool GetStringProperty(DBPropertyType property_type, const Slice& property,
std::string* value) { return false; }
bool GetIntProperty(DBPropertyType property_type, uint64_t* value,
DBImpl* db) const { return false; }
bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version,
uint64_t* value) const { return false; }
};
#endif // !ROCKSDB_LITE
} // namespace rocksdb
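A usage sketch for the two snapshot properties introduced above; this assumes the public-facing names carry the usual "rocksdb." prefix, matching the other properties listed in include/rocksdb/db.h:
#include <cstdint>
#include "rocksdb/db.h"

void LogSnapshotProperties(rocksdb::DB* db) {
  uint64_t num_snapshots = 0;
  uint64_t oldest_time = 0;
  // GetIntProperty() returns true iff the property is known and was filled in.
  if (db->GetIntProperty("rocksdb.num-snapshots", &num_snapshots) &&
      db->GetIntProperty("rocksdb.oldest-snapshot-time", &oldest_time)) {
    // num_snapshots live snapshots; the oldest was taken at unix time
    // oldest_time.
  }
}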

@ -0,0 +1,103 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <string>
#include <vector>
#include "db/column_family.h"
namespace rocksdb {
class MemTable;
struct JobContext {
inline bool HaveSomethingToDelete() const {
return full_scan_candidate_files.size() || sst_delete_files.size() ||
log_delete_files.size() || new_superversion != nullptr ||
superversions_to_free.size() > 0 || memtables_to_free.size() > 0;
}
// Structure to store information for candidate files to delete.
struct CandidateFileInfo {
std::string file_name;
uint32_t path_id;
CandidateFileInfo(std::string name, uint32_t path)
: file_name(std::move(name)), path_id(path) {}
bool operator==(const CandidateFileInfo& other) const {
return file_name == other.file_name && path_id == other.path_id;
}
};
// a list of all files that we'll consider deleting
// (every once in a while this is filled up with all files
// in the DB directory)
// (filled only if we're doing full scan)
std::vector<CandidateFileInfo> full_scan_candidate_files;
// the list of all live sst files that cannot be deleted
std::vector<FileDescriptor> sst_live;
// a list of sst files that we need to delete
std::vector<FileMetaData*> sst_delete_files;
// a list of log files that we need to delete
std::vector<uint64_t> log_delete_files;
// a list of memtables to be freed
autovector<MemTable*> memtables_to_free;
autovector<SuperVersion*> superversions_to_free;
SuperVersion* new_superversion; // if nullptr no new superversion
// the current manifest_file_number, log_number and prev_log_number
// that corresponds to the set of files in 'live'.
uint64_t manifest_file_number;
uint64_t pending_manifest_file_number;
uint64_t log_number;
uint64_t prev_log_number;
uint64_t min_pending_output = 0;
explicit JobContext(bool create_superversion = false) {
manifest_file_number = 0;
pending_manifest_file_number = 0;
log_number = 0;
prev_log_number = 0;
new_superversion = create_superversion ? new SuperVersion() : nullptr;
}
void Clean() {
// free pending memtables
for (auto m : memtables_to_free) {
delete m;
}
// free superversions
for (auto s : superversions_to_free) {
delete s;
}
// if new_superversion was not used, it will be non-nullptr and needs
// to be freed here
delete new_superversion;
memtables_to_free.clear();
superversions_to_free.clear();
new_superversion = nullptr;
}
~JobContext() {
assert(memtables_to_free.size() == 0);
assert(superversions_to_free.size() == 0);
assert(new_superversion == nullptr);
}
};
} // namespace rocksdb
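A minimal usage sketch of the JobContext contract above (illustrative, not part of this diff): the destructor asserts that nothing is still owned, so Clean() must run before the context is destroyed. The purge call is a hypothetical placeholder.
  rocksdb::JobContext job_context(true /* create_superversion */);
  // ... a flush or compaction job fills in files to delete and may hand
  // job_context.new_superversion off to a column family ...
  if (job_context.HaveSomethingToDelete()) {
    // PurgeObsoleteFiles(job_context);  // hypothetical cleanup hook
  }
  job_context.Clean();  // releases any memtables/superversions still owned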

@ -0,0 +1,401 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/dbformat.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/options.h"
#include "rocksdb/table_properties.h"
#include "table/block_based_table_factory.h"
#include "table/plain_table_factory.h"
#include "util/hash.h"
#include "util/hash_linklist_rep.h"
#include "utilities/merge_operators.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/rate_limiter.h"
#include "util/statistics.h"
#include "util/testharness.h"
#include "util/sync_point.h"
#include "util/testutil.h"
#ifndef ROCKSDB_LITE
namespace rocksdb {
class EventListenerTest {
public:
EventListenerTest() {
dbname_ = test::TmpDir() + "/listener_test";
ASSERT_OK(DestroyDB(dbname_, Options()));
db_ = nullptr;
Reopen();
}
~EventListenerTest() {
Close();
Options options;
options.db_paths.emplace_back(dbname_, 0);
options.db_paths.emplace_back(dbname_ + "_2", 0);
options.db_paths.emplace_back(dbname_ + "_3", 0);
options.db_paths.emplace_back(dbname_ + "_4", 0);
ASSERT_OK(DestroyDB(dbname_, options));
}
void CreateColumnFamilies(const std::vector<std::string>& cfs,
const ColumnFamilyOptions* options = nullptr) {
ColumnFamilyOptions cf_opts;
cf_opts = ColumnFamilyOptions(Options());
size_t cfi = handles_.size();
handles_.resize(cfi + cfs.size());
for (auto cf : cfs) {
ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
}
}
void Close() {
for (auto h : handles_) {
delete h;
}
handles_.clear();
delete db_;
db_ = nullptr;
}
void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
const Options* options = nullptr) {
ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
}
Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
const Options* options = nullptr) {
Close();
Options opts = (options == nullptr) ? Options() : *options;
std::vector<const Options*> v_opts(cfs.size(), &opts);
return TryReopenWithColumnFamilies(cfs, v_opts);
}
Status TryReopenWithColumnFamilies(
const std::vector<std::string>& cfs,
const std::vector<const Options*>& options) {
Close();
ASSERT_EQ(cfs.size(), options.size());
std::vector<ColumnFamilyDescriptor> column_families;
for (size_t i = 0; i < cfs.size(); ++i) {
column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i]));
}
DBOptions db_opts = DBOptions(*options[0]);
return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
}
Status TryReopen(Options* options = nullptr) {
Close();
Options opts;
if (options != nullptr) {
opts = *options;
} else {
opts.create_if_missing = true;
}
return DB::Open(opts, dbname_, &db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
const Options* options = nullptr) {
CreateColumnFamilies(cfs, options);
std::vector<std::string> cfs_plus_default = cfs;
cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
ReopenWithColumnFamilies(cfs_plus_default, options);
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
Status Put(int cf, const Slice& k, const Slice& v,
WriteOptions wo = WriteOptions()) {
return db_->Put(wo, handles_[cf], k, v);
}
Status Flush(int cf = 0) {
FlushOptions opt = FlushOptions();
opt.wait = true;
if (cf == 0) {
return db_->Flush(opt);
} else {
return db_->Flush(opt, handles_[cf]);
}
}
DB* db_;
std::string dbname_;
std::vector<ColumnFamilyHandle*> handles_;
};
class TestCompactionListener : public EventListener {
public:
void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override {
compacted_dbs_.push_back(db);
}
std::vector<DB*> compacted_dbs_;
};
TEST(EventListenerTest, OnSingleDBCompactionTest) {
const int kTestKeySize = 16;
const int kTestValueSize = 984;
const int kEntrySize = kTestKeySize + kTestValueSize;
const int kEntriesPerBuffer = 100;
const int kNumL0Files = 4;
Options options;
options.create_if_missing = true;
options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
options.compaction_style = kCompactionStyleLevel;
options.target_file_size_base = options.write_buffer_size;
options.max_bytes_for_level_base = options.target_file_size_base * 2;
options.max_bytes_for_level_multiplier = 2;
options.compression = kNoCompression;
options.enable_thread_tracking = true;
options.level0_file_num_compaction_trigger = kNumL0Files;
TestCompactionListener* listener = new TestCompactionListener();
options.listeners.emplace_back(listener);
std::vector<std::string> cf_names = {
"pikachu", "ilya", "muromec", "dobrynia",
"nikitich", "alyosha", "popovich"};
CreateAndReopenWithCF(cf_names, &options);
ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
for (size_t i = 1; i < 8; ++i) {
ASSERT_OK(Flush(static_cast<int>(i)));
const Slice kStart = "a";
const Slice kEnd = "z";
ASSERT_OK(dbfull()->CompactRange(handles_[i], &kStart, &kEnd));
dbfull()->TEST_WaitForFlushMemTable();
dbfull()->TEST_WaitForCompact();
}
ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
for (size_t i = 0; i < cf_names.size(); ++i) {
ASSERT_EQ(listener->compacted_dbs_[i], db_);
}
}
class TestFlushListener : public EventListener {
public:
void OnFlushCompleted(
DB* db, const std::string& name,
const std::string& file_path,
bool triggered_writes_slowdown,
bool triggered_writes_stop) override {
flushed_dbs_.push_back(db);
flushed_column_family_names_.push_back(name);
if (triggered_writes_slowdown) {
slowdown_count++;
}
if (triggered_writes_stop) {
stop_count++;
}
}
std::vector<std::string> flushed_column_family_names_;
std::vector<DB*> flushed_dbs_;
int slowdown_count;
int stop_count;
};
TEST(EventListenerTest, OnSingleDBFlushTest) {
Options options;
options.write_buffer_size = 100000;
TestFlushListener* listener = new TestFlushListener();
options.listeners.emplace_back(listener);
std::vector<std::string> cf_names = {
"pikachu", "ilya", "muromec", "dobrynia",
"nikitich", "alyosha", "popovich"};
CreateAndReopenWithCF(cf_names, &options);
ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
for (size_t i = 1; i < 8; ++i) {
ASSERT_OK(Flush(static_cast<int>(i)));
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ(listener->flushed_dbs_.size(), i);
ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
}
// make sure callback functions are called in the right order
for (size_t i = 0; i < cf_names.size(); ++i) {
ASSERT_EQ(listener->flushed_dbs_[i], db_);
ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
}
}
TEST(EventListenerTest, MultiCF) {
Options options;
options.write_buffer_size = 100000;
TestFlushListener* listener = new TestFlushListener();
options.listeners.emplace_back(listener);
std::vector<std::string> cf_names = {
"pikachu", "ilya", "muromec", "dobrynia",
"nikitich", "alyosha", "popovich"};
CreateAndReopenWithCF(cf_names, &options);
ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
for (size_t i = 1; i < 8; ++i) {
ASSERT_OK(Flush(static_cast<int>(i)));
ASSERT_EQ(listener->flushed_dbs_.size(), i);
ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
}
// make sure callback functions are called in the right order
for (size_t i = 0; i < cf_names.size(); i++) {
ASSERT_EQ(listener->flushed_dbs_[i], db_);
ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
}
}
TEST(EventListenerTest, MultiDBMultiListeners) {
std::vector<TestFlushListener*> listeners;
const int kNumDBs = 5;
const int kNumListeners = 10;
for (int i = 0; i < kNumListeners; ++i) {
listeners.emplace_back(new TestFlushListener());
}
std::vector<std::string> cf_names = {
"pikachu", "ilya", "muromec", "dobrynia",
"nikitich", "alyosha", "popovich"};
Options options;
options.create_if_missing = true;
for (int i = 0; i < kNumListeners; ++i) {
options.listeners.emplace_back(listeners[i]);
}
DBOptions db_opts(options);
ColumnFamilyOptions cf_opts(options);
std::vector<DB*> dbs;
std::vector<std::vector<ColumnFamilyHandle *>> vec_handles;
for (int d = 0; d < kNumDBs; ++d) {
ASSERT_OK(DestroyDB(dbname_ + ToString(d), options));
DB* db;
std::vector<ColumnFamilyHandle*> handles;
ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db));
for (size_t c = 0; c < cf_names.size(); ++c) {
ColumnFamilyHandle* handle;
db->CreateColumnFamily(cf_opts, cf_names[c], &handle);
handles.push_back(handle);
}
vec_handles.push_back(std::move(handles));
dbs.push_back(db);
}
for (int d = 0; d < kNumDBs; ++d) {
for (size_t c = 0; c < cf_names.size(); ++c) {
ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c],
cf_names[c], cf_names[c]));
}
}
for (size_t c = 0; c < cf_names.size(); ++c) {
for (int d = 0; d < kNumDBs; ++d) {
ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
reinterpret_cast<DBImpl*>(dbs[d])->TEST_WaitForFlushMemTable();
}
}
for (auto* listener : listeners) {
int pos = 0;
for (size_t c = 0; c < cf_names.size(); ++c) {
for (int d = 0; d < kNumDBs; ++d) {
ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
pos++;
}
}
}
for (auto handles : vec_handles) {
for (auto h : handles) {
delete h;
}
handles.clear();
}
vec_handles.clear();
for (auto db : dbs) {
delete db;
}
}
TEST(EventListenerTest, DisableBGCompaction) {
Options options;
TestFlushListener* listener = new TestFlushListener();
const int kSlowdownTrigger = 5;
const int kStopTrigger = 10;
options.level0_slowdown_writes_trigger = kSlowdownTrigger;
options.level0_stop_writes_trigger = kStopTrigger;
options.listeners.emplace_back(listener);
// BG compaction is disabled. The number of L0 files will simply keep
// increasing in this test.
options.compaction_style = kCompactionStyleNone;
options.compression = kNoCompression;
options.write_buffer_size = 100000; // Small write buffer
CreateAndReopenWithCF({"pikachu"}, &options);
WriteOptions wopts;
wopts.timeout_hint_us = 100000;
ColumnFamilyMetaData cf_meta;
db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
// keep writing until writes are forced to stop.
for (int i = 0; static_cast<int>(cf_meta.file_count) < kStopTrigger; ++i) {
Put(1, ToString(i), std::string(100000, 'x'), wopts);
db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
}
ASSERT_GE(listener->slowdown_count, kStopTrigger - kSlowdownTrigger);
ASSERT_GE(listener->stop_count, 1);
}
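// Why the assertion bounds hold (a note, not part of this diff): once the
// L0 file count passes kSlowdownTrigger, each subsequent flush completes
// with triggered_writes_slowdown == true, so at least
// kStopTrigger - kSlowdownTrigger slowdown callbacks accumulate before the
// loop exits, and the stop callback fires at least once when the count
// reaches kStopTrigger.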
} // namespace rocksdb
#endif // ROCKSDB_LITE
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

@ -6,25 +6,33 @@
#include <vector>
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "util/testharness.h"
#include "util/benchharness.h"
#include "db/version_set.h"
#include "db/write_controller.h"
#include "db/writebuffer.h"
#include "util/mutexlock.h"
namespace rocksdb {
std::string MakeKey(unsigned int num) {
std::string MakeKey(uint64_t num) {
char buf[30];
snprintf(buf, sizeof(buf), "%016u", num);
snprintf(buf, sizeof(buf), "%016" PRIu64, num);
return std::string(buf);
}
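An aside on why this change is needed (not part of this diff): passing a uint64_t through "%016u" is undefined behavior on platforms where unsigned int is 32 bits; the PRIu64 macro from <inttypes.h> expands to the correct conversion specifier everywhere. A self-contained illustration:
  #include <cinttypes>
  #include <cstdint>
  #include <cstdio>
  int main() {
    uint64_t num = 42;
    char buf[30];
    std::snprintf(buf, sizeof(buf), "%016" PRIu64, num);
    std::printf("%s\n", buf);  // prints 0000000000000042
    return 0;
  }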
void BM_LogAndApply(int iters, int num_base_files) {
VersionSet* vset;
WriteController wc;
ColumnFamilyData* default_cfd;
uint64_t fnum = 1;
port::Mutex mu;
MutexLock l(&mu);
InstrumentedMutex mu;
InstrumentedMutexLock l(&mu);
BENCHMARK_SUSPEND {
std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
@ -45,9 +53,10 @@ void BM_LogAndApply(int iters, int num_base_files) {
// Notice we are using the default options, not ones run through SanitizeOptions().
// We might want to initialize some options manually if needed.
options.db_paths.emplace_back(dbname, 0);
WriteBuffer wb(options.db_write_buffer_size);
// The table cache parameter is passed in as null, so any file I/O
// operation is likely to fail.
vset = new VersionSet(dbname, &options, sopt, nullptr);
vset = new VersionSet(dbname, &options, sopt, nullptr, &wb, &wc);
std::vector<ColumnFamilyDescriptor> dummy;
dummy.push_back(ColumnFamilyDescriptor());
ASSERT_OK(vset->Recover(dummy));
@ -58,7 +67,8 @@ void BM_LogAndApply(int iters, int num_base_files) {
InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
vbase.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1);
}
ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu));
ASSERT_OK(vset->LogAndApply(default_cfd,
*default_cfd->GetLatestMutableCFOptions(), &vbase, &mu));
}
for (int i = 0; i < iters; i++) {
@ -67,8 +77,10 @@ void BM_LogAndApply(int iters, int num_base_files) {
InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1);
vset->LogAndApply(default_cfd, &vedit, &mu);
vset->LogAndApply(default_cfd, *default_cfd->GetLatestMutableCFOptions(),
&vedit, &mu);
}
delete vset;
}
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)

@ -20,9 +20,9 @@ namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
Reader::Reader(unique_ptr<SequentialFile>&& _file, Reporter* reporter,
bool checksum, uint64_t initial_offset)
: file_(std::move(file)),
: file_(std::move(_file)),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
@ -32,8 +32,7 @@ Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
eof_offset_(0),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset) {
}
initial_offset_(initial_offset) {}
Reader::~Reader() {
delete[] backing_store_;
@ -55,7 +54,7 @@ bool Reader::SkipToInitialBlock() {
if (block_start_location > 0) {
Status skip_status = file_->Skip(block_start_location);
if (!skip_status.ok()) {
ReportDrop(block_start_location, skip_status);
ReportDrop(static_cast<size_t>(block_start_location), skip_status);
return false;
}
}

@ -558,9 +558,9 @@ TEST(LogTest, ErrorJoinsRecords) {
ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
const unsigned int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
size_t dropped = DroppedBytes();
ASSERT_LE(dropped, 2 * kBlockSize + 100);
ASSERT_GE(dropped, 2 * kBlockSize);
}
TEST(LogTest, ReadStart) {

@ -15,6 +15,7 @@
#include "db/dbformat.h"
#include "db/merge_context.h"
#include "db/writebuffer.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
@ -31,41 +32,63 @@
namespace rocksdb {
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
MemTableOptions::MemTableOptions(
const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options)
: write_buffer_size(mutable_cf_options.write_buffer_size),
arena_block_size(mutable_cf_options.arena_block_size),
memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits),
memtable_prefix_bloom_probes(
mutable_cf_options.memtable_prefix_bloom_probes),
memtable_prefix_bloom_huge_page_tlb_size(
mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size),
inplace_update_support(ioptions.inplace_update_support),
inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
inplace_callback(ioptions.inplace_callback),
max_successive_merges(mutable_cf_options.max_successive_merges),
filter_deletes(mutable_cf_options.filter_deletes),
statistics(ioptions.statistics),
merge_operator(ioptions.merge_operator),
info_log(ioptions.info_log) {}
MemTable::MemTable(const InternalKeyComparator& cmp,
const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options,
WriteBuffer* write_buffer)
: comparator_(cmp),
moptions_(ioptions, mutable_cf_options),
refs_(0),
kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
kWriteBufferSize(options.write_buffer_size),
arena_(options.arena_block_size),
table_(options.memtable_factory->CreateMemTableRep(
comparator_, &arena_, options.prefix_extractor.get(),
options.info_log.get())),
kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
arena_(moptions_.arena_block_size),
allocator_(&arena_, write_buffer),
table_(ioptions.memtable_factory->CreateMemTableRep(
comparator_, &allocator_, ioptions.prefix_extractor,
ioptions.info_log)),
num_entries_(0),
flush_in_progress_(false),
flush_completed_(false),
file_number_(0),
first_seqno_(0),
mem_next_logfile_number_(0),
locks_(options.inplace_update_support ? options.inplace_update_num_locks
: 0),
prefix_extractor_(options.prefix_extractor.get()),
should_flush_(ShouldFlushNow()) {
locks_(moptions_.inplace_update_support ?
moptions_.inplace_update_num_locks : 0),
prefix_extractor_(ioptions.prefix_extractor),
should_flush_(ShouldFlushNow()),
flush_scheduled_(false) {
// if should_flush_ == true without an entry inserted, something must have
// gone wrong already.
assert(!should_flush_);
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) {
prefix_bloom_.reset(new DynamicBloom(
&arena_,
options.memtable_prefix_bloom_bits, options.bloom_locality,
options.memtable_prefix_bloom_probes, nullptr,
options.memtable_prefix_bloom_huge_page_tlb_size,
options.info_log.get()));
&allocator_,
moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality,
moptions_.memtable_prefix_bloom_probes, nullptr,
moptions_.memtable_prefix_bloom_huge_page_tlb_size,
ioptions.info_log));
}
}
MemTable::~MemTable() {
assert(refs_ == 0);
}
MemTable::~MemTable() { assert(refs_ == 0); }
size_t MemTable::ApproximateMemoryUsage() {
size_t arena_usage = arena_.ApproximateMemoryUsage();
@ -97,14 +120,16 @@ bool MemTable::ShouldFlushNow() const {
// if we can still allocate one more block without exceeding the
// over-allocation ratio, then we should not flush.
if (allocated_memory + kArenaBlockSize <
kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
moptions_.write_buffer_size +
kArenaBlockSize * kAllowOverAllocationRatio) {
return false;
}
// if user keeps adding entries that exceed kWriteBufferSize, we need to
// flush earlier even though we still have much available memory left.
if (allocated_memory >
kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
// if user keeps adding entries that exceed moptions.write_buffer_size,
// we need to flush earlier even though we still have much available
// memory left.
if (allocated_memory > moptions_.write_buffer_size +
kArenaBlockSize * kAllowOverAllocationRatio) {
return true;
}
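// Worked example of the heuristic above (not part of this diff; the numbers
// are illustrative and assume an over-allocation ratio of 0.6): with
// write_buffer_size = 4 MiB and kArenaBlockSize = 512 KiB, the slack term
// is about 307 KiB. The memtable keeps accepting writes while
// allocated_memory + 512 KiB stays below 4 MiB + 307 KiB, and reports that
// it must flush once allocated_memory alone exceeds that same bound; in
// between, neither early return fires and the rest of ShouldFlushNow
// decides.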
@ -158,7 +183,7 @@ Slice MemTableRep::UserKey(const char* key) const {
}
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
*buf = arena_->Allocate(len);
*buf = allocator_->Allocate(len);
return static_cast<KeyHandle>(*buf);
}
@ -167,7 +192,7 @@ KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
// into this scratch space.
const char* EncodeKey(std::string* scratch, const Slice& target) {
scratch->clear();
PutVarint32(scratch, target.size());
PutVarint32(scratch, static_cast<uint32_t>(target.size()));
scratch->append(target.data(), target.size());
return scratch->data();
}
@ -175,12 +200,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator: public Iterator {
public:
MemTableIterator(
const MemTable& mem, const ReadOptions& options, Arena* arena)
const MemTable& mem, const ReadOptions& read_options, Arena* arena)
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
valid_(false),
arena_mode_(arena != nullptr) {
if (prefix_extractor_ != nullptr && !options.total_order_seek) {
if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
bloom_ = mem.prefix_bloom_.get();
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
} else {
@ -248,14 +273,10 @@ class MemTableIterator: public Iterator {
void operator=(const MemTableIterator&);
};
Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) {
if (arena == nullptr) {
return new MemTableIterator(*this, options, nullptr);
} else {
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem)
MemTableIterator(*this, options, arena);
}
Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) {
assert(arena != nullptr);
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem) MemTableIterator(*this, read_options, arena);
}
port::RWMutex* MemTable::GetLock(const Slice& key) {
@ -271,12 +292,12 @@ void MemTable::Add(SequenceNumber s, ValueType type,
// key bytes : char[internal_key.size()]
// value_size : varint32 of value.size()
// value bytes : char[value.size()]
size_t key_size = key.size();
size_t val_size = value.size();
size_t internal_key_size = key_size + 8;
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;
uint32_t key_size = static_cast<uint32_t>(key.size());
uint32_t val_size = static_cast<uint32_t>(value.size());
uint32_t internal_key_size = key_size + 8;
const uint32_t encoded_len = VarintLength(internal_key_size) +
internal_key_size + VarintLength(val_size) +
val_size;
char* buf = nullptr;
KeyHandle handle = table_->Allocate(encoded_len, &buf);
assert(buf != nullptr);
@ -399,7 +420,6 @@ static bool SaveValue(void* arg, const char* entry) {
*(s->found_final_value) = true;
return false;
}
std::string merge_result; // temporary area for merge results later
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
*(s->merge_in_progress) = true;
merge_context->PushOperand(v);
@ -416,13 +436,13 @@ static bool SaveValue(void* arg, const char* entry) {
}
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options) {
MergeContext* merge_context) {
// The sequence number is updated synchronously in version_set.h
if (first_seqno_ == 0) {
if (IsEmpty()) {
// Avoid recording stats for speed.
return false;
}
PERF_TIMER_AUTO(get_from_memtable_time);
PERF_TIMER_GUARD(get_from_memtable_time);
Slice user_key = key.user_key();
bool found_final_value = false;
@ -440,11 +460,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
saver.value = value;
saver.status = s;
saver.mem = this;
saver.merge_context = &merge_context;
saver.merge_operator = options.merge_operator.get();
saver.logger = options.info_log.get();
saver.inplace_update_support = options.inplace_update_support;
saver.statistics = options.statistics.get();
saver.merge_context = merge_context;
saver.merge_operator = moptions_.merge_operator;
saver.logger = moptions_.info_log;
saver.inplace_update_support = moptions_.inplace_update_support;
saver.statistics = moptions_.statistics;
table_->Get(key, &saver, SaveValue);
}
@ -452,7 +472,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress("");
}
PERF_TIMER_STOP(get_from_memtable_time);
PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value;
}
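// Side note (not part of this diff): PERF_TIMER_GUARD declares a scoped
// timer that stops automatically when Get() returns, which is why the
// explicit PERF_TIMER_STOP(get_from_memtable_time) call is removed above.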
@ -487,8 +506,8 @@ void MemTable::Update(SequenceNumber seq,
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
uint32_t prev_size = prev_value.size();
uint32_t new_size = value.size();
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
uint32_t new_size = static_cast<uint32_t>(value.size());
// Update value, if new value size <= previous value size
if (new_size <= prev_size) {
@ -517,8 +536,7 @@ void MemTable::Update(SequenceNumber seq,
bool MemTable::UpdateCallback(SequenceNumber seq,
const Slice& key,
const Slice& delta,
const Options& options) {
const Slice& delta) {
LookupKey lkey(key, seq);
Slice memkey = lkey.memtable_key();
@ -546,15 +564,15 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
uint32_t prev_size = prev_value.size();
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
char* prev_buffer = const_cast<char*>(prev_value.data());
uint32_t new_prev_size = prev_size;
std::string str_value;
WriteLock wl(GetLock(lkey.user_key()));
auto status = options.inplace_callback(prev_buffer, &new_prev_size,
delta, &str_value);
auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
delta, &str_value);
if (status == UpdateStatus::UPDATED_INPLACE) {
// Value already updated by callback.
assert(new_prev_size <= prev_size);
@ -567,12 +585,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
memcpy(p, prev_buffer, new_prev_size);
}
}
RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
should_flush_ = ShouldFlushNow();
return true;
} else if (status == UpdateStatus::UPDATED) {
Add(seq, kTypeValue, key, Slice(str_value));
RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
should_flush_ = ShouldFlushNow();
return true;
} else if (status == UpdateStatus::UPDATE_FAILED) {

@ -10,21 +10,48 @@
#pragma once
#include <string>
#include <memory>
#include <functional>
#include <deque>
#include <vector>
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "db/version_edit.h"
#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/immutable_options.h"
#include "db/memtable_allocator.h"
#include "util/arena.h"
#include "util/dynamic_bloom.h"
#include "util/mutable_cf_options.h"
namespace rocksdb {
class Arena;
class Mutex;
class MemTableIterator;
class MergeContext;
class WriteBuffer;
struct MemTableOptions {
explicit MemTableOptions(
const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options);
size_t write_buffer_size;
size_t arena_block_size;
uint32_t memtable_prefix_bloom_bits;
uint32_t memtable_prefix_bloom_probes;
size_t memtable_prefix_bloom_huge_page_tlb_size;
bool inplace_update_support;
size_t inplace_update_num_locks;
UpdateStatus (*inplace_callback)(char* existing_value,
uint32_t* existing_value_size,
Slice delta_value,
std::string* merged_value);
size_t max_successive_merges;
bool filter_deletes;
Statistics* statistics;
MergeOperator* merge_operator;
Logger* info_log;
};
class MemTable {
public:
@ -40,7 +67,9 @@ class MemTable {
// MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once.
explicit MemTable(const InternalKeyComparator& comparator,
const Options& options);
const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options,
WriteBuffer* write_buffer);
~MemTable();
@ -67,7 +96,11 @@ class MemTable {
// This method heuristically determines if the memtable should continue to
// host more data.
bool ShouldFlush() const { return should_flush_; }
bool ShouldScheduleFlush() const {
return flush_scheduled_ == false && should_flush_;
}
void MarkFlushScheduled() { flush_scheduled_ = true; }
// Return an iterator that yields the contents of the memtable.
//
@ -81,8 +114,7 @@ class MemTable {
// arena: If not null, the arena needs to be used to allocate the Iterator.
// Calling ~Iterator of the iterator will destroy all the states but
// those allocated in arena.
Iterator* NewIterator(const ReadOptions& options,
Arena* arena = nullptr);
Iterator* NewIterator(const ReadOptions& read_options, Arena* arena);
// Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type.
@ -100,7 +132,7 @@ class MemTable {
// store MergeInProgress in s, and return false.
// Else, return false.
bool Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options);
MergeContext* merge_context);
// Attempts to update the new_value inplace, else does normal Add
// Pseudocode
@ -124,8 +156,7 @@ class MemTable {
// else return false
bool UpdateCallback(SequenceNumber seq,
const Slice& key,
const Slice& delta,
const Options& options);
const Slice& delta);
// Returns the number of successive merge entries starting from the newest
// entry for the key up to the last non-merge entry or last entry for the
@ -138,6 +169,9 @@ class MemTable {
// Returns the edits area that is needed for flushing the memtable
VersionEdit* GetEdits() { return &edit_; }
// Returns true if no entry has been inserted into the memtable.
bool IsEmpty() const { return first_seqno_ == 0; }
// Returns the sequence number of the first element that was inserted
// into the memtable
SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
@ -151,7 +185,10 @@ class MemTable {
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
// Notify the underlying storage that no more items will be added
void MarkImmutable() { table_->MarkReadOnly(); }
void MarkImmutable() {
table_->MarkReadOnly();
allocator_.DoneAllocating();
}
// return true if the current MemTableRep supports merge operator.
bool IsMergeOperatorSupported() const {
@ -159,7 +196,10 @@ class MemTable {
}
// return true if the current MemTableRep supports snapshots.
bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
// inplace update prevents snapshots.
bool IsSnapshotSupported() const {
return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
}
// Get the lock associated for the key
port::RWMutex* GetLock(const Slice& key);
@ -168,10 +208,10 @@ class MemTable {
return comparator_.comparator;
}
const Arena& TEST_GetArena() const { return arena_; }
const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
private:
// Dynamically check if we can add more incoming entries.
// Dynamically check if we can add more incoming entries
bool ShouldFlushNow() const;
friend class MemTableIterator;
@ -179,10 +219,11 @@ class MemTable {
friend class MemTableList;
KeyComparator comparator_;
const MemTableOptions moptions_;
int refs_;
const size_t kArenaBlockSize;
const size_t kWriteBufferSize;
Arena arena_;
MemTableAllocator allocator_;
unique_ptr<MemTableRep> table_;
uint64_t num_entries_;
@ -214,6 +255,9 @@ class MemTable {
// a flag indicating if a memtable has met the criteria to flush
bool should_flush_;
// a flag indicating if flush has been scheduled
bool flush_scheduled_;
};
extern const char* EncodeKey(std::string* scratch, const Slice& target);
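A construction sketch for the new signature (illustrative, not part of this diff; the MutableCFOptions constructor shape is assumed from its use elsewhere in this change):
  Options options;
  ImmutableCFOptions ioptions(options);
  MutableCFOptions mutable_cf_options(options, ioptions);  // assumed ctor
  WriteBuffer wb(options.db_write_buffer_size);
  MemTable* mem = new MemTable(InternalKeyComparator(BytewiseComparator()),
                               ioptions, mutable_cf_options, &wb);
  mem->Ref();  // MemTables are reference counted; callers must Ref() once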

@ -0,0 +1,52 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <assert.h>
#include "db/memtable_allocator.h"
#include "db/writebuffer.h"
#include "util/arena.h"
namespace rocksdb {
MemTableAllocator::MemTableAllocator(Arena* arena, WriteBuffer* write_buffer)
: arena_(arena), write_buffer_(write_buffer), bytes_allocated_(0) {
}
MemTableAllocator::~MemTableAllocator() {
DoneAllocating();
}
char* MemTableAllocator::Allocate(size_t bytes) {
assert(write_buffer_ != nullptr);
bytes_allocated_ += bytes;
write_buffer_->ReserveMem(bytes);
return arena_->Allocate(bytes);
}
char* MemTableAllocator::AllocateAligned(size_t bytes, size_t huge_page_size,
Logger* logger) {
assert(write_buffer_ != nullptr);
bytes_allocated_ += bytes;
write_buffer_->ReserveMem(bytes);
return arena_->AllocateAligned(bytes, huge_page_size, logger);
}
void MemTableAllocator::DoneAllocating() {
if (write_buffer_ != nullptr) {
write_buffer_->FreeMem(bytes_allocated_);
write_buffer_ = nullptr;
}
}
size_t MemTableAllocator::BlockSize() const {
return arena_->BlockSize();
}
} // namespace rocksdb

@ -0,0 +1,47 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// This is used by the MemTable to allocate write buffer memory. It connects
// to WriteBuffer so we can track and enforce overall write buffer limits.
#pragma once
#include "util/allocator.h"
namespace rocksdb {
class Arena;
class Logger;
class WriteBuffer;
class MemTableAllocator : public Allocator {
public:
explicit MemTableAllocator(Arena* arena, WriteBuffer* write_buffer);
~MemTableAllocator();
// Allocator interface
char* Allocate(size_t bytes) override;
char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
Logger* logger = nullptr) override;
size_t BlockSize() const override;
// Call when we're finished allocating memory so we can free it from
// the write buffer's limit.
void DoneAllocating();
private:
Arena* arena_;
WriteBuffer* write_buffer_;
size_t bytes_allocated_;
// No copying allowed
MemTableAllocator(const MemTableAllocator&);
void operator=(const MemTableAllocator&);
};
} // namespace rocksdb
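A usage sketch of the accounting contract (illustrative, not part of this diff): every allocation reserves bytes against the shared WriteBuffer, and DoneAllocating(), also invoked from the destructor, releases the whole reservation at once.
  Arena arena;
  WriteBuffer write_buffer(64 << 20);  // hypothetical 64 MiB global limit
  MemTableAllocator allocator(&arena, &write_buffer);
  char* buf = allocator.Allocate(128);  // reserves 128 bytes in write_buffer
  (void)buf;
  allocator.DoneAllocating();           // returns the reservation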

@ -5,6 +5,11 @@
//
#include "db/memtable_list.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <string>
#include "rocksdb/db.h"
#include "db/memtable.h"
@ -62,10 +67,9 @@ int MemTableList::size() const {
// Return the most recent value found, if any.
// *merge_context accumulates the merge operands seen so far.
bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
Status* s, MergeContext& merge_context,
const Options& options) {
Status* s, MergeContext* merge_context) {
for (auto& memtable : memlist_) {
if (memtable->Get(key, value, s, merge_context, options)) {
if (memtable->Get(key, value, s, merge_context)) {
return true;
}
}
@ -73,9 +77,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
}
void MemTableListVersion::AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iterator_list) {
std::vector<Iterator*>* iterator_list,
Arena* arena) {
for (auto& m : memlist_) {
iterator_list->push_back(m->NewIterator(options));
iterator_list->push_back(m->NewIterator(options, arena));
}
}
@ -114,7 +119,7 @@ void MemTableListVersion::Remove(MemTable* m) {
bool MemTableList::IsFlushPending() const {
if ((flush_requested_ && num_flush_not_started_ >= 1) ||
(num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
assert(imm_flush_needed.NoBarrier_Load() != nullptr);
assert(imm_flush_needed.load(std::memory_order_relaxed));
return true;
}
return false;
@ -129,7 +134,7 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
assert(!m->flush_completed_);
num_flush_not_started_--;
if (num_flush_not_started_ == 0) {
imm_flush_needed.Release_Store(nullptr);
imm_flush_needed.store(false, std::memory_order_release);
}
m->flush_in_progress_ = true; // flushing will start very soon
ret->push_back(m);
@ -139,8 +144,7 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
}
void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
uint64_t file_number,
FileNumToPathIdMap* pending_outputs) {
uint64_t file_number) {
assert(!mems.empty());
// If the flush was not successful, then just reset state.
@ -154,15 +158,14 @@ void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
m->edit_.Clear();
num_flush_not_started_++;
}
pending_outputs->erase(file_number);
imm_flush_needed.Release_Store(reinterpret_cast<void *>(1));
imm_flush_needed.store(true, std::memory_order_release);
}
// Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults(
ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
port::Mutex* mu, Logger* info_log, uint64_t file_number,
FileNumToPathIdMap* pending_outputs, autovector<MemTable*>* to_delete,
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
const autovector<MemTable*>& mems, VersionSet* vset, InstrumentedMutex* mu,
uint64_t file_number, autovector<MemTable*>* to_delete,
Directory* db_directory, LogBuffer* log_buffer) {
mu->AssertHeld();
@ -193,11 +196,11 @@ Status MemTableList::InstallMemtableFlushResults(
break;
}
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
cfd->GetName().c_str(), (unsigned long)m->file_number_);
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started",
cfd->GetName().c_str(), m->file_number_);
// this can release and reacquire the mutex.
s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory);
// we will be changing the version in the next code path,
// so we better create a new one, since versions are immutable
@ -208,34 +211,26 @@ Status MemTableList::InstallMemtableFlushResults(
uint64_t mem_id = 1; // how many memtables has been flushed.
do {
if (s.ok()) { // commit new state
LogToBuffer(log_buffer,
"[%s] Level-0 commit table #%lu: memtable #%lu done",
cfd->GetName().c_str(), (unsigned long)m->file_number_,
(unsigned long)mem_id);
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " done",
cfd->GetName().c_str(), m->file_number_, mem_id);
current_->Remove(m);
assert(m->file_number_ > 0);
// pending_outputs can be cleared only after the newly created file
// has been written to a committed version so that other concurrently
// executing compaction threads do not mistakenly assume that this
// file is not live.
pending_outputs->erase(m->file_number_);
if (m->Unref() != nullptr) {
to_delete->push_back(m);
}
} else {
// Commit failed. Set up state so that we can flush again.
Log(info_log,
"Level-0 commit table #%lu: memtable #%lu failed",
(unsigned long)m->file_number_,
(unsigned long)mem_id);
LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " failed",
m->file_number_, mem_id);
m->flush_completed_ = false;
m->flush_in_progress_ = false;
m->edit_.Clear();
num_flush_not_started_++;
pending_outputs->erase(m->file_number_);
m->file_number_ = 0;
imm_flush_needed.Release_Store((void *)1);
imm_flush_needed.store(true, std::memory_order_release);
}
++mem_id;
} while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
@ -258,17 +253,17 @@ void MemTableList::Add(MemTable* m) {
m->MarkImmutable();
num_flush_not_started_++;
if (num_flush_not_started_ == 1) {
imm_flush_needed.Release_Store((void *)1);
imm_flush_needed.store(true, std::memory_order_release);
}
}
// Returns an estimate of the number of bytes of data in use.
size_t MemTableList::ApproximateMemoryUsage() {
size_t size = 0;
size_t total_size = 0;
for (auto& memtable : current_->memlist_) {
size += memtable->ApproximateMemoryUsage();
total_size += memtable->ApproximateMemoryUsage();
}
return size;
return total_size;
}
void MemTableList::InstallNewVersion() {

@ -22,13 +22,14 @@
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "util/autovector.h"
#include "util/instrumented_mutex.h"
#include "util/log_buffer.h"
namespace rocksdb {
class ColumnFamilyData;
class InternalKeyComparator;
class Mutex;
class InstrumentedMutex;
class MergeIteratorBuilder;
// Keeps a list of immutable memtables in a vector. The list is immutable
@ -46,10 +47,10 @@ class MemTableListVersion {
// Search all the memtables starting from the most recent one.
// Return the most recent value found, if any.
bool Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options);
MergeContext* merge_context);
void AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iterator_list);
std::vector<Iterator*>* iterator_list, Arena* arena);
void AddIterators(const ReadOptions& options,
MergeIteratorBuilder* merge_iter_builder);
@ -78,12 +79,12 @@ class MemTableList {
public:
// A list of memtables.
explicit MemTableList(int min_write_buffer_number_to_merge)
: min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
: imm_flush_needed(false),
min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
current_(new MemTableListVersion()),
num_flush_not_started_(0),
commit_in_progress_(false),
flush_requested_(false) {
imm_flush_needed.Release_Store(nullptr);
current_->Ref();
}
~MemTableList() {}
@ -92,7 +93,7 @@ class MemTableList {
// so that background threads can detect a non-nullptr pointer and
// determine whether there is anything more to start flushing.
port::AtomicPointer imm_flush_needed;
std::atomic<bool> imm_flush_needed;
// Returns the total number of memtables in the list
int size() const;
@ -108,14 +109,13 @@ class MemTableList {
// Reset status of the given memtable list back to pending state so that
// they can get picked up again on the next round of flush.
void RollbackMemtableFlush(const autovector<MemTable*>& mems,
uint64_t file_number,
FileNumToPathIdMap* pending_outputs);
uint64_t file_number);
// Commit a successful flush in the manifest file
Status InstallMemtableFlushResults(
ColumnFamilyData* cfd, const autovector<MemTable*>& m, VersionSet* vset,
port::Mutex* mu, Logger* info_log, uint64_t file_number,
FileNumToPathIdMap* pending_outputs, autovector<MemTable*>* to_delete,
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
const autovector<MemTable*>& m, VersionSet* vset, InstrumentedMutex* mu,
uint64_t file_number, autovector<MemTable*>* to_delete,
Directory* db_directory, LogBuffer* log_buffer);
// New memtables are inserted at the front of the list.
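The flag conversion above, shown side by side (not part of this diff): the AtomicPointer was only ever used as a boolean, so std::atomic<bool> expresses the same release-store/relaxed-load protocol directly.
  // before: a pointer doubling as a flag
  imm_flush_needed.Release_Store((void*)1);
  bool pending_before = imm_flush_needed.NoBarrier_Load() != nullptr;
  // after: an explicit atomic flag
  imm_flush_needed.store(true, std::memory_order_release);
  bool pending_after = imm_flush_needed.load(std::memory_order_relaxed);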

@ -0,0 +1,696 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#define __STDC_FORMAT_MACROS
#ifndef GFLAGS
#include <cstdio>
int main() {
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
return 1;
}
#else
#include <gflags/gflags.h>
#include <atomic>
#include <iostream>
#include <memory>
#include <thread>
#include <type_traits>
#include <vector>
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/writebuffer.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "util/arena.h"
#include "util/mutexlock.h"
#include "util/stop_watch.h"
#include "util/testutil.h"
using GFLAGS::ParseCommandLineFlags;
using GFLAGS::RegisterFlagValidator;
using GFLAGS::SetUsageMessage;
DEFINE_string(benchmarks, "fillrandom",
"Comma-separated list of benchmarks to run. Options:\n"
"\tfillrandom -- write N random values\n"
"\tfillseq -- write N values in sequential order\n"
"\treadrandom -- read N values in random order\n"
"\treadseq -- scan the DB\n"
"\treadwrite -- 1 thread writes while N - 1 threads "
"do random\n"
"\t reads\n"
"\tseqreadwrite -- 1 thread writes while N - 1 threads "
"do scans\n");
DEFINE_string(memtablerep, "skiplist",
"Which implementation of memtablerep to use. See "
"include/memtablerep.h for\n"
" more details. Options:\n"
"\tskiplist -- backed by a skiplist\n"
"\tvector -- backed by an std::vector\n"
"\thashskiplist -- backed by a hash skip list\n"
"\thashlinklist -- backed by a hash linked list\n"
"\tcuckoo -- backed by a cuckoo hash table");
DEFINE_int64(bucket_count, 1000000,
"bucket_count parameter to pass into NewHashSkiplistRepFactory or "
"NewHashLinkListRepFactory");
DEFINE_int32(
hashskiplist_height, 4,
"skiplist_height parameter to pass into NewHashSkiplistRepFactory");
DEFINE_int32(
hashskiplist_branching_factor, 4,
"branching_factor parameter to pass into NewHashSkiplistRepFactory");
DEFINE_int32(
huge_page_tlb_size, 0,
"huge_page_tlb_size parameter to pass into NewHashLinkListRepFactory");
DEFINE_int32(bucket_entries_logging_threshold, 4096,
"bucket_entries_logging_threshold parameter to pass into "
"NewHashLinkListRepFactory");
DEFINE_bool(if_log_bucket_dist_when_flash, true,
"if_log_bucket_dist_when_flash parameter to pass into "
"NewHashLinkListRepFactory");
DEFINE_int32(
threshold_use_skiplist, 256,
"threshold_use_skiplist parameter to pass into NewHashLinkListRepFactory");
DEFINE_int64(
write_buffer_size, 256,
"write_buffer_size parameter to pass into NewHashCuckooRepFactory");
DEFINE_int64(
average_data_size, 64,
"average_data_size parameter to pass into NewHashCuckooRepFactory");
DEFINE_int64(
hash_function_count, 4,
"hash_function_count parameter to pass into NewHashCuckooRepFactory");
DEFINE_int32(
num_threads, 1,
"Number of concurrent threads to run. If the benchmark includes writes,\n"
"then at most one thread will be a writer");
DEFINE_int32(num_operations, 1000000,
"Number of operations to do for write and random read benchmarks");
DEFINE_int32(num_scans, 10,
"Number of times for each thread to scan the memtablerep for "
"sequential read "
"benchmarks");
DEFINE_int32(item_size, 100, "Number of bytes each item should be");
DEFINE_int32(prefix_length, 8,
"Prefix length to pass into NewFixedPrefixTransform");
/* VectorRep settings */
DEFINE_int64(vectorrep_count, 0,
"Number of entries to reserve on VectorRep initialization");
DEFINE_int64(seed, 0,
"Seed base for random number generators. "
"When 0 it is deterministic.");
static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
namespace rocksdb {
namespace {
struct CallbackVerifyArgs {
bool found;
LookupKey* key;
MemTableRep* table;
InternalKeyComparator* comparator;
};
} // namespace
// Helper for quickly generating random data.
class RandomGenerator {
private:
std::string data_;
unsigned int pos_;
public:
RandomGenerator() {
Random rnd(301);
auto size = (unsigned)std::max(1048576, FLAGS_item_size);
test::RandomString(&rnd, size, &data_);
pos_ = 0;
}
Slice Generate(unsigned int len) {
assert(len <= data_.size());
if (pos_ + len > data_.size()) {
pos_ = 0;
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
enum WriteMode { SEQUENTIAL, RANDOM, UNIQUE_RANDOM };
class KeyGenerator {
public:
KeyGenerator(Random64* rand, WriteMode mode, uint64_t num)
: rand_(rand), mode_(mode), num_(num), next_(0) {
if (mode_ == UNIQUE_RANDOM) {
// NOTE: if memory consumption of this approach becomes a concern,
// we can break it into pieces and only shuffle one section at a
// time. Alternatively, use a bitmap implementation
// (https://reviews.facebook.net/differential/diff/54627/)
values_.resize(num_);
for (uint64_t i = 0; i < num_; ++i) {
values_[i] = i;
}
std::shuffle(
values_.begin(), values_.end(),
std::default_random_engine(static_cast<unsigned int>(FLAGS_seed)));
}
}
uint64_t Next() {
switch (mode_) {
case SEQUENTIAL:
return next_++;
case RANDOM:
return rand_->Next() % num_;
case UNIQUE_RANDOM:
return values_[next_++];
}
assert(false);
return std::numeric_limits<uint64_t>::max();
}
private:
Random64* rand_;
WriteMode mode_;
const uint64_t num_;
uint64_t next_;
std::vector<uint64_t> values_;
};
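A usage sketch (not part of this diff): in UNIQUE_RANDOM mode the generator visits every key in [0, num) exactly once, in shuffled order, which lets a single fill thread write distinct keys without extra bookkeeping.
  Random64 rand(static_cast<uint64_t>(FLAGS_seed));  // from util/random.h
  KeyGenerator gen(&rand, UNIQUE_RANDOM, 1000 /* num */);
  for (int i = 0; i < 3; ++i) {
    uint64_t key = gen.Next();  // a distinct key on every call in this mode
    (void)key;
  }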
class BenchmarkThread {
public:
explicit BenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* bytes_written, uint64_t* bytes_read,
uint64_t* sequence, uint64_t num_ops,
uint64_t* read_hits)
: table_(table),
key_gen_(key_gen),
bytes_written_(bytes_written),
bytes_read_(bytes_read),
sequence_(sequence),
num_ops_(num_ops),
read_hits_(read_hits) {}
virtual void operator()() = 0;
virtual ~BenchmarkThread() {}
protected:
MemTableRep* table_;
KeyGenerator* key_gen_;
uint64_t* bytes_written_;
uint64_t* bytes_read_;
uint64_t* sequence_;
uint64_t num_ops_;
uint64_t* read_hits_;
RandomGenerator generator_;
};
class FillBenchmarkThread : public BenchmarkThread {
public:
FillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* bytes_written, uint64_t* bytes_read,
uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
: BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
num_ops, read_hits) {}
void FillOne() {
char* buf = nullptr;
auto internal_key_size = 16;
auto encoded_len =
FLAGS_item_size + VarintLength(internal_key_size) + internal_key_size;
KeyHandle handle = table_->Allocate(encoded_len, &buf);
assert(buf != nullptr);
char* p = EncodeVarint32(buf, internal_key_size);
auto key = key_gen_->Next();
EncodeFixed64(p, key);
p += 8;
EncodeFixed64(p, ++(*sequence_));
p += 8;
Slice bytes = generator_.Generate(FLAGS_item_size);
memcpy(p, bytes.data(), FLAGS_item_size);
p += FLAGS_item_size;
assert(p == buf + encoded_len);
table_->Insert(handle);
*bytes_written_ += encoded_len;
}
void operator()() override {
for (unsigned int i = 0; i < num_ops_; ++i) {
FillOne();
}
}
};
class ConcurrentFillBenchmarkThread : public FillBenchmarkThread {
public:
ConcurrentFillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* bytes_written, uint64_t* bytes_read,
uint64_t* sequence, uint64_t num_ops,
uint64_t* read_hits,
std::atomic_int* threads_done)
: FillBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
num_ops, read_hits) {
threads_done_ = threads_done;
}
void operator()() override {
// # of read threads will be total threads - write threads (always 1).
// Loop until all read threads are done.
while ((*threads_done_).load() < (FLAGS_num_threads - 1)) {
FillOne();
}
}
private:
std::atomic_int* threads_done_;
};
class ReadBenchmarkThread : public BenchmarkThread {
public:
ReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* bytes_written, uint64_t* bytes_read,
uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
: BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
num_ops, read_hits) {}
static bool callback(void* arg, const char* entry) {
CallbackVerifyArgs* callback_args = static_cast<CallbackVerifyArgs*>(arg);
assert(callback_args != nullptr);
uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if ((callback_args->comparator)->user_comparator()->Compare(
Slice(key_ptr, key_length - 8), callback_args->key->user_key()) ==
0) {
callback_args->found = true;
}
return false;
}
void ReadOne() {
std::string user_key;
auto key = key_gen_->Next();
PutFixed64(&user_key, key);
LookupKey lookup_key(user_key, *sequence_);
InternalKeyComparator internal_key_comp(BytewiseComparator());
CallbackVerifyArgs verify_args;
verify_args.found = false;
verify_args.key = &lookup_key;
verify_args.table = table_;
verify_args.comparator = &internal_key_comp;
table_->Get(lookup_key, &verify_args, callback);
if (verify_args.found) {
*bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
++*read_hits_;
}
}
void operator()() override {
for (unsigned int i = 0; i < num_ops_; ++i) {
ReadOne();
}
}
};
class SeqReadBenchmarkThread : public BenchmarkThread {
public:
SeqReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* bytes_written, uint64_t* bytes_read,
uint64_t* sequence, uint64_t num_ops,
uint64_t* read_hits)
: BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
num_ops, read_hits) {}
void ReadOneSeq() {
std::unique_ptr<MemTableRep::Iterator> iter(table_->GetIterator());
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
// pretend to read the value
*bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
}
++*read_hits_;
}
void operator()() override {
for (unsigned int i = 0; i < num_ops_; ++i) {
ReadOneSeq();
}
}
};
class ConcurrentReadBenchmarkThread : public ReadBenchmarkThread {
public:
ConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* bytes_written, uint64_t* bytes_read,
uint64_t* sequence, uint64_t num_ops,
uint64_t* read_hits,
std::atomic_int* threads_done)
: ReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
num_ops, read_hits) {
threads_done_ = threads_done;
}
void operator()() override {
for (unsigned int i = 0; i < num_ops_; ++i) {
ReadOne();
}
++*threads_done_;
}
private:
std::atomic_int* threads_done_;
};
class SeqConcurrentReadBenchmarkThread : public SeqReadBenchmarkThread {
public:
SeqConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* bytes_written,
uint64_t* bytes_read, uint64_t* sequence,
uint64_t num_ops, uint64_t* read_hits,
std::atomic_int* threads_done)
: SeqReadBenchmarkThread(table, key_gen, bytes_written, bytes_read,
sequence, num_ops, read_hits) {
threads_done_ = threads_done;
}
void operator()() override {
for (unsigned int i = 0; i < num_ops_; ++i) {
ReadOneSeq();
}
++*threads_done_;
}
private:
std::atomic_int* threads_done_;
};
class Benchmark {
public:
explicit Benchmark(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* sequence, uint32_t num_threads)
: table_(table),
key_gen_(key_gen),
sequence_(sequence),
num_threads_(num_threads) {}
virtual ~Benchmark() {}
virtual void Run() {
std::cout << "Number of threads: " << num_threads_ << std::endl;
std::vector<std::thread> threads;
uint64_t bytes_written = 0;
uint64_t bytes_read = 0;
uint64_t read_hits = 0;
StopWatchNano timer(Env::Default(), true);
RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits);
auto elapsed_time = static_cast<double>(timer.ElapsedNanos() / 1000);
std::cout << "Elapsed time: " << static_cast<int>(elapsed_time) << " us"
<< std::endl;
if (bytes_written > 0) {
auto MiB_written = static_cast<double>(bytes_written) / (1 << 20);
auto write_throughput = MiB_written / (elapsed_time / 1000000);
std::cout << "Total bytes written: " << MiB_written << " MiB"
<< std::endl;
std::cout << "Write throughput: " << write_throughput << " MiB/s"
<< std::endl;
auto us_per_op = elapsed_time / num_write_ops_per_thread_;
std::cout << "write us/op: " << us_per_op << std::endl;
}
if (bytes_read > 0) {
auto MiB_read = static_cast<double>(bytes_read) / (1 << 20);
auto read_throughput = MiB_read / (elapsed_time / 1000000);
std::cout << "Total bytes read: " << MiB_read << " MiB" << std::endl;
std::cout << "Read throughput: " << read_throughput << " MiB/s"
<< std::endl;
auto us_per_op = elapsed_time / num_read_ops_per_thread_;
std::cout << "read us/op: " << us_per_op << std::endl;
}
}
virtual void RunThreads(std::vector<std::thread>* threads,
uint64_t* bytes_written, uint64_t* bytes_read,
bool write, uint64_t* read_hits) = 0;
protected:
MemTableRep* table_;
KeyGenerator* key_gen_;
uint64_t* sequence_;
uint64_t num_write_ops_per_thread_;
uint64_t num_read_ops_per_thread_;
const uint32_t num_threads_;
};
class FillBenchmark : public Benchmark {
public:
explicit FillBenchmark(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* sequence)
: Benchmark(table, key_gen, sequence, 1) {
num_write_ops_per_thread_ = FLAGS_num_operations;
}
void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
uint64_t* bytes_read, bool write,
uint64_t* read_hits) override {
FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_,
num_write_ops_per_thread_, read_hits)();
}
};
class ReadBenchmark : public Benchmark {
public:
explicit ReadBenchmark(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* sequence)
: Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
num_read_ops_per_thread_ = FLAGS_num_operations / FLAGS_num_threads;
}
void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
uint64_t* bytes_read, bool write,
uint64_t* read_hits) override {
for (int i = 0; i < FLAGS_num_threads; ++i) {
threads->emplace_back(
ReadBenchmarkThread(table_, key_gen_, bytes_written, bytes_read,
sequence_, num_read_ops_per_thread_, read_hits));
}
for (auto& thread : *threads) {
thread.join();
}
std::cout << "read hit%: "
<< (static_cast<double>(*read_hits) / FLAGS_num_operations) * 100
<< std::endl;
}
};
class SeqReadBenchmark : public Benchmark {
public:
explicit SeqReadBenchmark(MemTableRep* table, uint64_t* sequence)
: Benchmark(table, nullptr, sequence, FLAGS_num_threads) {
num_read_ops_per_thread_ = FLAGS_num_scans;
}
void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
uint64_t* bytes_read, bool write,
uint64_t* read_hits) override {
for (int i = 0; i < FLAGS_num_threads; ++i) {
threads->emplace_back(SeqReadBenchmarkThread(
table_, key_gen_, bytes_written, bytes_read, sequence_,
num_read_ops_per_thread_, read_hits));
}
for (auto& thread : *threads) {
thread.join();
}
}
};
template <class ReadThreadType>
class ReadWriteBenchmark : public Benchmark {
public:
explicit ReadWriteBenchmark(MemTableRep* table, KeyGenerator* key_gen,
uint64_t* sequence)
: Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
num_read_ops_per_thread_ =
FLAGS_num_threads <= 1
? 0
: (FLAGS_num_operations / (FLAGS_num_threads - 1));
num_write_ops_per_thread_ = FLAGS_num_operations;
}
void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
uint64_t* bytes_read, bool write,
uint64_t* read_hits) override {
std::atomic_int threads_done;
threads_done.store(0);
threads->emplace_back(ConcurrentFillBenchmarkThread(
table_, key_gen_, bytes_written, bytes_read, sequence_,
num_write_ops_per_thread_, read_hits, &threads_done));
for (int i = 1; i < FLAGS_num_threads; ++i) {
threads->emplace_back(
ReadThreadType(table_, key_gen_, bytes_written, bytes_read, sequence_,
num_read_ops_per_thread_, read_hits, &threads_done));
}
for (auto& thread : *threads) {
thread.join();
}
}
};
} // namespace rocksdb
void PrintWarnings() {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
fprintf(stdout,
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
#endif
#ifndef NDEBUG
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
}
int main(int argc, char** argv) {
rocksdb::port::InstallStackTraceHandler();
SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
" [OPTIONS]...");
ParseCommandLineFlags(&argc, &argv, true);
PrintWarnings();
rocksdb::Options options;
std::unique_ptr<rocksdb::MemTableRepFactory> factory;
if (FLAGS_memtablerep == "skiplist") {
factory.reset(new rocksdb::SkipListFactory);
} else if (FLAGS_memtablerep == "vector") {
factory.reset(new rocksdb::VectorRepFactory);
} else if (FLAGS_memtablerep == "hashskiplist") {
factory.reset(rocksdb::NewHashSkipListRepFactory(
FLAGS_bucket_count, FLAGS_hashskiplist_height,
FLAGS_hashskiplist_branching_factor));
options.prefix_extractor.reset(
rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
} else if (FLAGS_memtablerep == "hashlinklist") {
factory.reset(rocksdb::NewHashLinkListRepFactory(
FLAGS_bucket_count, FLAGS_huge_page_tlb_size,
FLAGS_bucket_entries_logging_threshold,
FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist));
options.prefix_extractor.reset(
rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
} else if (FLAGS_memtablerep == "cuckoo") {
factory.reset(rocksdb::NewHashCuckooRepFactory(
FLAGS_write_buffer_size, FLAGS_average_data_size,
static_cast<uint32_t>(FLAGS_hash_function_count)));
options.prefix_extractor.reset(
rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
} else {
fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str());
exit(1);
}
rocksdb::InternalKeyComparator internal_key_comp(
rocksdb::BytewiseComparator());
rocksdb::MemTable::KeyComparator key_comp(internal_key_comp);
rocksdb::Arena arena;
rocksdb::WriteBuffer wb(FLAGS_write_buffer_size);
rocksdb::MemTableAllocator memtable_allocator(&arena, &wb);
uint64_t sequence;
auto createMemtableRep = [&] {
sequence = 0;
return factory->CreateMemTableRep(key_comp, &memtable_allocator,
options.prefix_extractor.get(),
options.info_log.get());
};
std::unique_ptr<rocksdb::MemTableRep> memtablerep;
rocksdb::Random64 rng(FLAGS_seed);
const char* benchmarks = FLAGS_benchmarks.c_str();
while (benchmarks != nullptr) {
std::unique_ptr<rocksdb::KeyGenerator> key_gen;
const char* sep = strchr(benchmarks, ',');
rocksdb::Slice name;
if (sep == nullptr) {
name = benchmarks;
benchmarks = nullptr;
} else {
name = rocksdb::Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
std::unique_ptr<rocksdb::Benchmark> benchmark;
if (name == rocksdb::Slice("fillseq")) {
memtablerep.reset(createMemtableRep());
key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL,
FLAGS_num_operations));
benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(),
key_gen.get(), &sequence));
} else if (name == rocksdb::Slice("fillrandom")) {
memtablerep.reset(createMemtableRep());
key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::UNIQUE_RANDOM,
FLAGS_num_operations));
benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(),
key_gen.get(), &sequence));
} else if (name == rocksdb::Slice("readrandom")) {
key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
FLAGS_num_operations));
benchmark.reset(new rocksdb::ReadBenchmark(memtablerep.get(),
key_gen.get(), &sequence));
} else if (name == rocksdb::Slice("readseq")) {
key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL,
FLAGS_num_operations));
benchmark.reset(
new rocksdb::SeqReadBenchmark(memtablerep.get(), &sequence));
} else if (name == rocksdb::Slice("readwrite")) {
memtablerep.reset(createMemtableRep());
key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
FLAGS_num_operations));
benchmark.reset(new rocksdb::ReadWriteBenchmark<
rocksdb::ConcurrentReadBenchmarkThread>(memtablerep.get(),
key_gen.get(), &sequence));
} else if (name == rocksdb::Slice("seqreadwrite")) {
memtablerep.reset(createMemtableRep());
key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
FLAGS_num_operations));
benchmark.reset(new rocksdb::ReadWriteBenchmark<
rocksdb::SeqConcurrentReadBenchmarkThread>(memtablerep.get(),
key_gen.get(), &sequence));
} else {
std::cout << "WARNING: skipping unknown benchmark '" << name.ToString()
<< std::endl;
continue;
}
std::cout << "Running " << name.ToString() << std::endl;
benchmark->Run();
}
return 0;
}
#endif // GFLAGS
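// Example invocation (a sketch; the binary name assumes the usual
// memtablerep_bench make target, and the flag names come from the
// FLAGS_* declarations referenced above):
//
//   ./memtablerep_bench --memtablerep=skiplist \
//     --benchmarks=fillrandom,readrandom \
//     --num_threads=4 --num_operations=1000000
//
// Benchmarks run in the order given: fillseq/fillrandom (and the
// readwrite variants) create a fresh memtable rep, while readrandom and
// readseq reuse the rep built by the preceding fill.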

@ -85,9 +85,10 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (success_) {
std::string& key = keys_.back(); // The original key encountered
std::string& original_key =
keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
UpdateInternalKey(&original_key[0], original_key.size(),
orig_ikey.sequence, orig_ikey.type);
swap(operands_.back(), merge_result);
} else {
@ -108,17 +109,17 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
// => store result in operands_.back() (and update keys_.back())
// => change the entry type to kTypeValue for keys_.back()
// We are done! Success!
const Slice value = iter->value();
success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
operands_, &merge_result,
logger_);
const Slice val = iter->value();
success_ = user_merge_operator_->FullMerge(ikey.user_key, &val, operands_,
&merge_result, logger_);
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (success_) {
std::string& key = keys_.back(); // The original key encountered
std::string& original_key =
keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
UpdateInternalKey(&original_key[0], original_key.size(),
orig_ikey.sequence, orig_ikey.type);
swap(operands_.back(), merge_result);
} else {
@ -177,9 +178,9 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
logger_);
if (success_) {
std::string& key = keys_.back(); // The original key encountered
std::string& original_key = keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
UpdateInternalKey(&original_key[0], original_key.size(),
orig_ikey.sequence, orig_ikey.type);
// The final value() is always stored in operands_.back()
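// Context for UpdateInternalKey above: a RocksDB internal key is the user
// key followed by an 8-byte trailer packing the sequence number and value
// type. A minimal sketch of the trailer rewrite, assuming the fixed64
// little-endian helper from util/coding.h:
//
//   void UpdateInternalKeySketch(char* ikey, size_t ikey_size,
//                                uint64_t seq, unsigned char type) {
//     assert(ikey_size >= 8);
//     uint64_t packed = (seq << 8) | type;  // high 56 bits: seq, low 8: type
//     EncodeFixed64(ikey + ikey_size - 8, packed);  // overwrite the trailer
//   }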

@ -23,15 +23,11 @@ using namespace std;
using namespace rocksdb;
namespace {
int numMergeOperatorCalls;
void resetNumMergeOperatorCalls() {
numMergeOperatorCalls = 0;
}
size_t num_merge_operator_calls;
void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
int num_partial_merge_calls;
void resetNumPartialMergeCalls() {
num_partial_merge_calls = 0;
}
size_t num_partial_merge_calls;
void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
}
class CountMergeOperator : public AssociativeMergeOperator {
@ -45,7 +41,7 @@ class CountMergeOperator : public AssociativeMergeOperator {
const Slice& value,
std::string* new_value,
Logger* logger) const override {
++numMergeOperatorCalls;
++num_merge_operator_calls;
if (existing_value == nullptr) {
new_value->assign(value.data(), value.size());
return true;
@ -307,31 +303,31 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) {
}
}
void testSuccessiveMerge(
Counters& counters, int max_num_merges, int num_merges) {
void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
size_t num_merges) {
counters.assert_remove("z");
uint64_t sum = 0;
for (int i = 1; i <= num_merges; ++i) {
for (size_t i = 1; i <= num_merges; ++i) {
resetNumMergeOperatorCalls();
counters.assert_add("z", i);
sum += i;
if (i % (max_num_merges + 1) == 0) {
assert(numMergeOperatorCalls == max_num_merges + 1);
assert(num_merge_operator_calls == max_num_merges + 1);
} else {
assert(numMergeOperatorCalls == 0);
assert(num_merge_operator_calls == 0);
}
resetNumMergeOperatorCalls();
assert(counters.assert_get("z") == sum);
assert(numMergeOperatorCalls == i % (max_num_merges + 1));
assert(num_merge_operator_calls == i % (max_num_merges + 1));
}
}
void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
int count) {
void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
size_t min_merge, size_t count) {
FlushOptions o;
o.wait = true;
@ -339,7 +335,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
// operands exceeds the threshold.
uint64_t tmp_sum = 0;
resetNumPartialMergeCalls();
for (int i = 1; i <= count; i++) {
for (size_t i = 1; i <= count; i++) {
counters->assert_add("b", i);
tmp_sum += i;
}
@ -348,7 +344,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
ASSERT_EQ(tmp_sum, counters->assert_get("b"));
if (count > max_merge) {
// in this case, FullMerge should be called instead.
ASSERT_EQ(num_partial_merge_calls, 0);
ASSERT_EQ(num_partial_merge_calls, 0U);
} else {
// if count >= min_merge, then partial merge should be called once.
ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
@ -358,20 +354,18 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
resetNumPartialMergeCalls();
tmp_sum = 0;
db->Put(rocksdb::WriteOptions(), "c", "10");
for (int i = 1; i <= count; i++) {
for (size_t i = 1; i <= count; i++) {
counters->assert_add("c", i);
tmp_sum += i;
}
db->Flush(o);
db->CompactRange(nullptr, nullptr);
ASSERT_EQ(tmp_sum, counters->assert_get("c"));
ASSERT_EQ(num_partial_merge_calls, 0);
ASSERT_EQ(num_partial_merge_calls, 0U);
}
void testSingleBatchSuccessiveMerge(
DB* db,
int max_num_merges,
int num_merges) {
void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
size_t num_merges) {
assert(num_merges > max_num_merges);
Slice key("BatchSuccessiveMerge");
@ -380,7 +374,7 @@ void testSingleBatchSuccessiveMerge(
// Create the batch
WriteBatch batch;
for (int i = 0; i < num_merges; ++i) {
for (size_t i = 0; i < num_merges; ++i) {
batch.Merge(key, merge_value_slice);
}
@ -390,8 +384,9 @@ void testSingleBatchSuccessiveMerge(
Status s = db->Write(WriteOptions(), &batch);
assert(s.ok());
}
assert(numMergeOperatorCalls ==
num_merges - (num_merges % (max_num_merges + 1)));
ASSERT_EQ(
num_merge_operator_calls,
static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
// Get the value
resetNumMergeOperatorCalls();
@ -403,18 +398,11 @@ void testSingleBatchSuccessiveMerge(
assert(get_value_str.size() == sizeof(uint64_t));
uint64_t get_value = DecodeFixed64(&get_value_str[0]);
ASSERT_EQ(get_value, num_merges * merge_value);
ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
ASSERT_EQ(num_merge_operator_calls,
static_cast<size_t>((num_merges % (max_num_merges + 1))));
}
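// Worked example for the assertions above (a sketch, not part of the test):
// with max_num_merges = 2, the operator folds operands in groups of
// max_num_merges + 1 = 3. Writing num_merges = 5 merge operands therefore
// triggers 5 - (5 % 3) = 3 operator calls inside the write path, leaving
// 5 % 3 = 2 unmerged operands that the subsequent Get() folds with 2 calls.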
void runTest(int argc, const string& dbname, const bool use_ttl = false) {
auto db = OpenDb(dbname, use_ttl);
{
cout << "Test read-modify-write counters... \n";
Counters counters(db, 0);
testCounters(counters, db.get(), true);
}
bool compact = false;
if (argc > 1) {
compact = true;
@ -422,13 +410,22 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) {
}
{
cout << "Test merge-based counters... \n";
MergeBasedCounters counters(db, 0);
testCounters(counters, db.get(), compact);
auto db = OpenDb(dbname, use_ttl);
{
cout << "Test read-modify-write counters... \n";
Counters counters(db, 0);
testCounters(counters, db.get(), true);
}
{
cout << "Test merge-based counters... \n";
MergeBasedCounters counters(db, 0);
testCounters(counters, db.get(), compact);
}
}
DestroyDB(dbname, Options());
db.reset();
{
cout << "Test merge in memtable... \n";

@ -6,7 +6,6 @@
#include <algorithm>
#include <iostream>
#include <vector>
#include "/usr/include/valgrind/callgrind.h"
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
@ -15,6 +14,8 @@
#include "util/histogram.h"
#include "util/stop_watch.h"
#include "util/testharness.h"
#include "util/thread_status_util.h"
#include "util/string_util.h"
bool FLAGS_random_key = false;
@ -29,7 +30,7 @@ const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
namespace rocksdb {
std::shared_ptr<DB> OpenDb() {
std::shared_ptr<DB> OpenDb(bool read_only = false) {
DB* db;
Options options;
options.create_if_missing = true;
@ -39,12 +40,21 @@ std::shared_ptr<DB> OpenDb() {
FLAGS_min_write_buffer_number_to_merge;
if (FLAGS_use_set_based_memetable) {
auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
options.memtable_factory.reset(
NewHashSkipListRepFactory(prefix_extractor));
#ifndef ROCKSDB_LITE
options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(0));
options.memtable_factory.reset(NewHashSkipListRepFactory());
#else
fprintf(stderr, "Prefix hash is not supported in lite mode\n");
exit(1);
#endif // ROCKSDB_LITE
}
Status s = DB::Open(options, kDbName, &db);
Status s;
if (!read_only) {
s = DB::Open(options, kDbName, &db);
} else {
s = DB::OpenForReadOnly(options, kDbName, &db);
}
ASSERT_OK(s);
return std::shared_ptr<DB>(db);
}
@ -58,25 +68,26 @@ TEST(PerfContextTest, SeekIntoDeletion) {
ReadOptions read_options;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
std::string key = "k" + ToString(i);
std::string value = "v" + ToString(i);
db->Put(write_options, key, value);
}
for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
std::string key = "k" + std::to_string(i);
std::string key = "k" + ToString(i);
db->Delete(write_options, key);
}
HistogramImpl hist_get;
HistogramImpl hist_get_time;
for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
std::string key = "k" + std::to_string(i);
std::string key = "k" + ToString(i);
std::string value;
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
StopWatchNano timer(Env::Default());
timer.Start();
auto status = db->Get(read_options, key, &value);
auto elapsed_nanos = timer.ElapsedNanos();
ASSERT_TRUE(status.IsNotFound());
@ -84,27 +95,32 @@ TEST(PerfContextTest, SeekIntoDeletion) {
hist_get_time.Add(elapsed_nanos);
}
std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
std::cout << "Get user key comparison: \n" << hist_get.ToString()
<< "Get time: \n" << hist_get_time.ToString();
HistogramImpl hist_seek_to_first;
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
{
HistogramImpl hist_seek_to_first;
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->SeekToFirst();
hist_seek_to_first.Add(perf_context.user_key_comparison_count);
auto elapsed_nanos = timer.ElapsedNanos();
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->SeekToFirst();
hist_seek_to_first.Add(perf_context.user_key_comparison_count);
auto elapsed_nanos = timer.ElapsedNanos();
std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
<< "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
<< "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
<< "elapsed: " << elapsed_nanos << "\n";
std::cout << "SeekToFirst uesr key comparison: \n"
<< hist_seek_to_first.ToString()
<< "ikey skipped: " << perf_context.internal_key_skipped_count
<< "\n"
<< "idelete skipped: "
<< perf_context.internal_delete_skipped_count << "\n"
<< "elapsed: " << elapsed_nanos << "\n";
}
HistogramImpl hist_seek;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
std::string key = "k" + std::to_string(i);
std::string key = "k" + ToString(i);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
@ -149,11 +165,12 @@ TEST(PerfContextTest, StopWatchNanoOverhead) {
TEST(PerfContextTest, StopWatchOverhead) {
// profile the timer cost by itself!
const int kTotalIterations = 1000000;
uint64_t elapsed = 0;
std::vector<uint64_t> timings(kTotalIterations);
StopWatch timer(Env::Default());
StopWatch timer(Env::Default(), nullptr, 0, &elapsed);
for (auto& timing : timings) {
timing = timer.ElapsedMicros();
timing = elapsed;
}
HistogramImpl histogram;
@ -166,7 +183,7 @@ TEST(PerfContextTest, StopWatchOverhead) {
std::cout << histogram.ToString();
}
void ProfileKeyComparison() {
void ProfileQueries(bool enabled_time = false) {
DestroyDB(kDbName, Options()); // Start this test with a fresh DB
auto db = OpenDb();
@ -175,74 +192,248 @@ void ProfileKeyComparison() {
ReadOptions read_options;
HistogramImpl hist_put;
HistogramImpl hist_get;
HistogramImpl hist_get_snapshot;
HistogramImpl hist_get_memtable;
HistogramImpl hist_get_files;
HistogramImpl hist_get_post_process;
HistogramImpl hist_num_memtable_checked;
HistogramImpl hist_mget;
HistogramImpl hist_mget_snapshot;
HistogramImpl hist_mget_memtable;
HistogramImpl hist_mget_files;
HistogramImpl hist_mget_post_process;
HistogramImpl hist_mget_num_memtable_checked;
HistogramImpl hist_write_pre_post;
HistogramImpl hist_write_wal_time;
HistogramImpl hist_write_memtable_time;
uint64_t total_db_mutex_nanos = 0;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
std::vector<int> keys;
const int kFlushFlag = -1;
for (int i = 0; i < FLAGS_total_keys; ++i) {
keys.push_back(i);
if (i == FLAGS_total_keys / 2) {
// Issuing a flush in the middle.
keys.push_back(kFlushFlag);
}
}
if (FLAGS_random_key) {
std::random_shuffle(keys.begin(), keys.end());
}
#ifndef NDEBUG
ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
#endif
int num_mutex_waited = 0;
for (const int i : keys) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
if (i == kFlushFlag) {
FlushOptions fo;
db->Flush(fo);
continue;
}
std::string key = "k" + ToString(i);
std::string value = "v" + ToString(i);
std::vector<std::string> values;
perf_context.Reset();
db->Put(write_options, key, value);
if (++num_mutex_waited > 3) {
#ifndef NDEBUG
ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
#endif
}
hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
hist_write_wal_time.Add(perf_context.write_wal_time);
hist_write_memtable_time.Add(perf_context.write_memtable_time);
hist_put.Add(perf_context.user_key_comparison_count);
total_db_mutex_nanos += perf_context.db_mutex_lock_nanos;
}
#ifndef NDEBUG
ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
#endif
for (const int i : keys) {
std::string key = "k" + ToString(i);
std::string value = "v" + ToString(i);
std::vector<Slice> multiget_keys = {Slice(key)};
std::vector<std::string> values;
perf_context.Reset();
db->Get(read_options, key, &value);
hist_get_snapshot.Add(perf_context.get_snapshot_time);
hist_get_memtable.Add(perf_context.get_from_memtable_time);
hist_get_files.Add(perf_context.get_from_output_files_time);
hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_get_post_process.Add(perf_context.get_post_process_time);
hist_get.Add(perf_context.user_key_comparison_count);
perf_context.Reset();
db->MultiGet(read_options, multiget_keys, &values);
hist_mget_snapshot.Add(perf_context.get_snapshot_time);
hist_mget_memtable.Add(perf_context.get_from_memtable_time);
hist_mget_files.Add(perf_context.get_from_output_files_time);
hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_mget_post_process.Add(perf_context.get_post_process_time);
hist_mget.Add(perf_context.user_key_comparison_count);
}
std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
<< "Get uesr key comparison: \n" << hist_get.ToString();
<< "Get uesr key comparison: \n" << hist_get.ToString()
<< "MultiGet uesr key comparison: \n" << hist_get.ToString();
std::cout << "Put(): Pre and Post Process Time: \n"
<< hist_write_pre_post.ToString()
<< " Writing WAL time: \n"
<< hist_write_wal_time.ToString() << "\n"
<< " Writing Mem Table time: \n"
<< hist_write_memtable_time.ToString() << "\n";
<< hist_write_memtable_time.ToString() << "\n"
<< " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n";
std::cout << "Get(): Time to get snapshot: \n"
std::cout << "Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_get_memtable.ToString() << "\n"
<< " Time to get value from output files: \n"
<< hist_get_files.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n" << hist_get_post_process.ToString()
<< "\n";
std::cout << "MultiGet(): Time to get snapshot: \n"
<< hist_mget_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_mget_memtable.ToString() << "\n"
<< " Time to get value from output files: \n"
<< hist_mget_files.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_mget_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n" << hist_mget_post_process.ToString()
<< "\n";
if (enabled_time) {
ASSERT_GT(hist_get.Average(), 0);
ASSERT_GT(hist_get_snapshot.Average(), 0);
ASSERT_GT(hist_get_memtable.Average(), 0);
ASSERT_GT(hist_get_files.Average(), 0);
ASSERT_GT(hist_get_post_process.Average(), 0);
ASSERT_GT(hist_num_memtable_checked.Average(), 0);
ASSERT_GT(hist_mget.Average(), 0);
ASSERT_GT(hist_mget_snapshot.Average(), 0);
ASSERT_GT(hist_mget_memtable.Average(), 0);
ASSERT_GT(hist_mget_files.Average(), 0);
ASSERT_GT(hist_mget_post_process.Average(), 0);
ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
#ifndef NDEBUG
ASSERT_GT(total_db_mutex_nanos, 2000U);
#endif
}
db.reset();
db = OpenDb(true);
hist_get.Clear();
hist_get_snapshot.Clear();
hist_get_memtable.Clear();
hist_get_files.Clear();
hist_get_post_process.Clear();
hist_num_memtable_checked.Clear();
hist_mget.Clear();
hist_mget_snapshot.Clear();
hist_mget_memtable.Clear();
hist_mget_files.Clear();
hist_mget_post_process.Clear();
hist_mget_num_memtable_checked.Clear();
for (const int i : keys) {
std::string key = "k" + ToString(i);
std::string value = "v" + ToString(i);
std::vector<Slice> multiget_keys = {Slice(key)};
std::vector<std::string> values;
perf_context.Reset();
db->Get(read_options, key, &value);
hist_get_snapshot.Add(perf_context.get_snapshot_time);
hist_get_memtable.Add(perf_context.get_from_memtable_time);
hist_get_files.Add(perf_context.get_from_output_files_time);
hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_get_post_process.Add(perf_context.get_post_process_time);
hist_get.Add(perf_context.user_key_comparison_count);
perf_context.Reset();
db->MultiGet(read_options, multiget_keys, &values);
hist_mget_snapshot.Add(perf_context.get_snapshot_time);
hist_mget_memtable.Add(perf_context.get_from_memtable_time);
hist_mget_files.Add(perf_context.get_from_output_files_time);
hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_mget_post_process.Add(perf_context.get_post_process_time);
hist_mget.Add(perf_context.user_key_comparison_count);
}
std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString()
<< "ReadOnly MultiGet uesr key comparison: \n"
<< hist_mget.ToString();
std::cout << "ReadOnly Get(): Time to get snapshot: \n"
<< hist_get_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_get_memtable.ToString() << "\n"
<< " Time to get value from output files: \n"
<< hist_get_files.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n"
<< hist_get_post_process.ToString() << "\n";
<< " Time to post process: \n" << hist_get_post_process.ToString()
<< "\n";
std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
<< hist_mget_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_mget_memtable.ToString() << "\n"
<< " Time to get value from output files: \n"
<< hist_mget_files.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_mget_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n" << hist_mget_post_process.ToString()
<< "\n";
if (enabled_time) {
ASSERT_GT(hist_get.Average(), 0);
ASSERT_GT(hist_get_memtable.Average(), 0);
ASSERT_GT(hist_get_files.Average(), 0);
ASSERT_GT(hist_num_memtable_checked.Average(), 0);
// In read-only mode Get(), no super version operation is needed
ASSERT_EQ(hist_get_post_process.Average(), 0);
ASSERT_EQ(hist_get_snapshot.Average(), 0);
ASSERT_GT(hist_mget.Average(), 0);
ASSERT_GT(hist_mget_snapshot.Average(), 0);
ASSERT_GT(hist_mget_memtable.Average(), 0);
ASSERT_GT(hist_mget_files.Average(), 0);
ASSERT_GT(hist_mget_post_process.Average(), 0);
ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
}
}
TEST(PerfContextTest, KeyComparisonCount) {
SetPerfLevel(kEnableCount);
ProfileKeyComparison();
ProfileQueries();
SetPerfLevel(kDisable);
ProfileKeyComparison();
ProfileQueries();
SetPerfLevel(kEnableTime);
ProfileKeyComparison();
ProfileQueries(true);
}
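// Minimal usage sketch for the perf_context counters exercised above
// (the db handle and key are hypothetical; SetPerfLevel/perf_context are
// the real APIs used throughout this test):
//
//   SetPerfLevel(kEnableTime);   // collect both counts and timings
//   perf_context.Reset();        // zero the thread-local counters
//   std::string value;
//   db->Get(ReadOptions(), "some_key", &value);
//   uint64_t comparisons = perf_context.user_key_comparison_count;
//   uint64_t memtable_nanos = perf_context.get_from_memtable_time;
//   SetPerfLevel(kDisable);      // stop paying the bookkeeping cost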
// make perf_context_test
@ -281,8 +472,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
SetPerfLevel(kEnableTime);
StopWatchNano timer(Env::Default());
for (const int i : keys) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
std::string key = "k" + ToString(i);
std::string value = "v" + ToString(i);
perf_context.Reset();
timer.Start();
@ -301,8 +492,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
HistogramImpl hist_next;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
std::string key = "k" + ToString(i);
std::string value = "v" + ToString(i);
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();

@ -158,7 +158,7 @@ class PlainTableDBTest {
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
size_t last_non_zero_offset = 0;
for (int level = 0; level < db_->NumberLevels(); level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber;
class TestPlainTableReader : public PlainTableReader {
public:
TestPlainTableReader(const EnvOptions& storage_options,
TestPlainTableReader(const EnvOptions& env_options,
const InternalKeyComparator& icomparator,
EncodingType encoding_type, uint64_t file_size,
int bloom_bits_per_key, double hash_table_ratio,
size_t index_sparseness,
const TableProperties* table_properties,
unique_ptr<RandomAccessFile>&& file,
const Options& options, bool* expect_bloom_not_match,
const ImmutableCFOptions& ioptions,
bool* expect_bloom_not_match,
bool store_index_in_file)
: PlainTableReader(options, std::move(file), storage_options, icomparator,
: PlainTableReader(ioptions, std::move(file), env_options, icomparator,
encoding_type, file_size, table_properties),
expect_bloom_not_match_(expect_bloom_not_match) {
Status s = MmapDataFile();
@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader {
PlainTablePropertyNames::kBloomVersion);
ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
if (options.bloom_locality > 0) {
if (ioptions.bloom_locality > 0) {
auto num_blocks_ptr = props->user_collected_properties.find(
PlainTablePropertyNames::kNumBloomBlocks);
ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
@ -253,25 +254,26 @@ class TestPlainTableFactory : public PlainTableFactory {
store_index_in_file_(options.store_index_in_file),
expect_bloom_not_match_(expect_bloom_not_match) {}
Status NewTableReader(const Options& options, const EnvOptions& soptions,
Status NewTableReader(const ImmutableCFOptions& ioptions,
const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table) const override {
TableProperties* props = nullptr;
auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
options.env, options.info_log.get(), &props);
ioptions.env, ioptions.info_log, &props);
ASSERT_TRUE(s.ok());
if (store_index_in_file_) {
BlockHandle bloom_block_handle;
s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
options.env, BloomBlockBuilder::kBloomBlock,
ioptions.env, BloomBlockBuilder::kBloomBlock,
&bloom_block_handle);
ASSERT_TRUE(s.ok());
BlockHandle index_block_handle;
s = FindMetaBlock(
file.get(), file_size, kPlainTableMagicNumber, options.env,
file.get(), file_size, kPlainTableMagicNumber, ioptions.env,
PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
ASSERT_TRUE(s.ok());
}
@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory {
DecodeFixed32(encoding_type_prop->second.c_str()));
std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
soptions, internal_comparator, encoding_type, file_size,
env_options, internal_comparator, encoding_type, file_size,
bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
std::move(file), options, expect_bloom_not_match_,
std::move(file), ioptions, expect_bloom_not_match_,
store_index_in_file_));
*table = std::move(new_reader);
@ -626,7 +628,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) {
};
for (size_t i = 0; i < 7; i++) {
ASSERT_OK(Put(key_list[i], std::to_string(i)));
ASSERT_OK(Put(key_list[i], ToString(i)));
}
dbfull()->TEST_FlushMemTable();
@ -637,7 +639,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) {
for (size_t i = 0; i < 7; i++) {
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(key_list[i], iter->key().ToString());
ASSERT_EQ(std::to_string(i), iter->value().ToString());
ASSERT_EQ(ToString(i), iter->value().ToString());
iter->Next();
}
@ -674,7 +676,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
MakeLongKeyWithPrefix(26, '6')};
for (size_t i = 0; i < 7; i++) {
ASSERT_OK(Put(key_list[i], std::to_string(i)));
ASSERT_OK(Put(key_list[i], ToString(i)));
}
dbfull()->TEST_FlushMemTable();
@ -685,7 +687,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
for (size_t i = 0; i < 7; i++) {
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(key_list[i], iter->key().ToString());
ASSERT_EQ(std::to_string(i), iter->value().ToString());
ASSERT_EQ(ToString(i), iter->value().ToString());
iter->Next();
}
@ -694,40 +696,12 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
delete iter;
}
// A test comparator which compares two strings in this way:
// (1) first compare the 8-byte prefixes in alphabetical order,
// (2) if two strings share the same prefix, sort the remainder of the
// string in reverse alphabetical order.
class SimpleSuffixReverseComparator : public Comparator {
public:
SimpleSuffixReverseComparator() {}
virtual const char* Name() const { return "SimpleSuffixReverseComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
Slice prefix_a = Slice(a.data(), 8);
Slice prefix_b = Slice(b.data(), 8);
int prefix_comp = prefix_a.compare(prefix_b);
if (prefix_comp != 0) {
return prefix_comp;
} else {
Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
return -(suffix_a.compare(suffix_b));
}
}
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const {}
virtual void FindShortSuccessor(std::string* key) const {}
};
TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
SimpleSuffixReverseComparator comp;
test::SimpleSuffixReverseComparator comp;
options.comparator = &comp;
DestroyAndReopen(&options);
@ -890,7 +864,7 @@ TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
for (unsigned char i = 1; i <= 3; i++) {
Options options = CurrentOptions();
options.create_if_missing = true;
SimpleSuffixReverseComparator comp;
test::SimpleSuffixReverseComparator comp;
options.comparator = &comp;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4

@ -29,14 +29,14 @@ using GFLAGS::ParseCommandLineFlags;
DEFINE_bool(trigger_deadlock, false,
"issue delete in range scan to trigger PrefixHashMap deadlock");
DEFINE_uint64(bucket_count, 100000, "number of buckets");
DEFINE_int32(bucket_count, 100000, "number of buckets");
DEFINE_uint64(num_locks, 10001, "number of locks");
DEFINE_bool(random_prefix, false, "randomize prefix");
DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
DEFINE_int64(write_buffer_size, 33554432, "");
DEFINE_int64(max_write_buffer_number, 2, "");
DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
DEFINE_int32(max_write_buffer_number, 2, "");
DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
DEFINE_int32(skiplist_height, 4, "");
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
@ -52,7 +52,8 @@ struct TestKey {
uint64_t prefix;
uint64_t sorted;
TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
TestKey(uint64_t _prefix, uint64_t _sorted)
: prefix(_prefix), sorted(_sorted) {}
};
// return a slice backed by test_key
@ -441,7 +442,7 @@ TEST(PrefixTest, DynamicPrefixIterator) {
for (auto prefix : prefixes) {
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
std::string value = "v" + ToString(0);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);

@ -31,7 +31,10 @@
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "db/builder.h"
#include "db/db_impl.h"
@ -42,10 +45,14 @@
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/writebuffer.h"
#include "db/write_batch_internal.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/immutable_options.h"
#include "util/scoped_arena_iterator.h"
namespace rocksdb {
@ -58,6 +65,7 @@ class Repairer {
env_(options.env),
icmp_(options.comparator),
options_(SanitizeOptions(dbname, &icmp_, options)),
ioptions_(options_),
raw_table_cache_(
// TableCache can be small since we expect each table to be opened
// once.
@ -65,7 +73,7 @@ class Repairer {
options_.table_cache_remove_scan_count_limit)),
next_file_number_(1) {
table_cache_ =
new TableCache(&options_, storage_options_, raw_table_cache_.get());
new TableCache(ioptions_, env_options_, raw_table_cache_.get());
edit_ = new VersionEdit();
}
@ -87,7 +95,7 @@ class Repairer {
for (size_t i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.fd.GetFileSize();
}
Log(options_.info_log,
Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
"**** Repaired rocksdb %s; "
"recovered %zu files; %" PRIu64
"bytes. "
@ -107,8 +115,9 @@ class Repairer {
std::string const dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
Options const options_;
const InternalKeyComparator icmp_;
const Options options_;
const ImmutableCFOptions ioptions_;
std::shared_ptr<Cache> raw_table_cache_;
TableCache* table_cache_;
VersionEdit* edit_;
@ -118,7 +127,7 @@ class Repairer {
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
const EnvOptions storage_options_;
const EnvOptions env_options_;
Status FindFiles() {
std::vector<std::string> filenames;
@ -167,7 +176,7 @@ class Repairer {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(options_.info_log,
Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
"Log #%" PRIu64 ": ignoring conversion error: %s", logs_[i],
status.ToString().c_str());
}
@ -182,7 +191,8 @@ class Repairer {
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) {
// We print error messages for corruption, but continue repairing.
Log(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s", lognum,
Log(InfoLogLevel::ERROR_LEVEL, info_log,
"Log #%" PRIu64 ": dropping %d bytes; %s", lognum,
static_cast<int>(bytes), s.ToString().c_str());
}
};
@ -190,7 +200,7 @@ class Repairer {
// Open the log file
std::string logname = LogFileName(dbname_, log);
unique_ptr<SequentialFile> lfile;
Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
Status status = env_->NewSequentialFile(logname, &lfile, env_options_);
if (!status.ok()) {
return status;
}
@ -211,8 +221,10 @@ class Repairer {
std::string scratch;
Slice record;
WriteBatch batch;
MemTable* mem = new MemTable(icmp_, options_);
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
WriteBuffer wb(options_.db_write_buffer_size);
MemTable* mem = new MemTable(icmp_, ioptions_,
MutableCFOptions(options_, ioptions_), &wb);
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem);
mem->Ref();
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
@ -226,7 +238,8 @@ class Repairer {
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(options_.info_log, "Log #%" PRIu64 ": ignoring %s", log,
Log(InfoLogLevel::WARN_LEVEL,
options_.info_log, "Log #%" PRIu64 ": ignoring %s", log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
@ -236,12 +249,15 @@ class Repairer {
// since ExtractMetaData() will also generate edits.
FileMetaData meta;
meta.fd = FileDescriptor(next_file_number_++, 0, 0);
ReadOptions ro;
ro.total_order_seek = true;
Iterator* iter = mem->NewIterator(ro);
status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
iter, &meta, icmp_, 0, 0, kNoCompression);
delete iter;
{
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_,
iter.get(), &meta, icmp_, 0, 0, kNoCompression,
CompressionOptions());
}
delete mem->Unref();
delete cf_mems_default;
mem = nullptr;
@ -250,9 +266,9 @@ class Repairer {
table_fds_.push_back(meta.fd);
}
}
Log(options_.info_log,
"Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter,
meta.fd.GetNumber(), status.ToString().c_str());
Log(InfoLogLevel::INFO_LEVEL, options_.info_log,
"Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
log, counter, meta.fd.GetNumber(), status.ToString().c_str());
return status;
}
@ -267,7 +283,8 @@ class Repairer {
char file_num_buf[kFormatFileNumberBufSize];
FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
file_num_buf, sizeof(file_num_buf));
Log(options_.info_log, "Table #%s: ignoring %s", file_num_buf,
Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
"Table #%s: ignoring %s", file_num_buf,
status.ToString().c_str());
ArchiveFile(fname);
} else {
@ -286,7 +303,7 @@ class Repairer {
file_size);
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), storage_options_, icmp_, t->meta.fd);
ReadOptions(), env_options_, icmp_, t->meta.fd);
bool empty = true;
ParsedInternalKey parsed;
t->min_sequence = 0;
@ -294,7 +311,8 @@ class Repairer {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(options_.info_log, "Table #%" PRIu64 ": unparsable key %s",
Log(InfoLogLevel::ERROR_LEVEL,
options_.info_log, "Table #%" PRIu64 ": unparsable key %s",
t->meta.fd.GetNumber(), EscapeString(key).c_str());
continue;
}
@ -317,7 +335,8 @@ class Repairer {
}
delete iter;
}
Log(options_.info_log, "Table #%" PRIu64 ": %d entries %s",
Log(InfoLogLevel::INFO_LEVEL,
options_.info_log, "Table #%" PRIu64 ": %d entries %s",
t->meta.fd.GetNumber(), counter, status.ToString().c_str());
return status;
}
@ -326,7 +345,7 @@ class Repairer {
std::string tmp = TempFileName(dbname_, 1);
unique_ptr<WritableFile> file;
Status status = env_->NewWritableFile(
tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
tmp, &file, env_->OptimizeForManifestWrite(env_options_));
if (!status.ok()) {
return status;
}
@ -394,7 +413,8 @@ class Repairer {
new_file.append("/");
new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(options_.info_log, "Archiving %s: %s\n",
Log(InfoLogLevel::INFO_LEVEL,
options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};

@ -1,810 +0,0 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <set>
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/statistics.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/table_builder.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"
using std::unique_ptr;
// IS THIS FILE STILL NEEDED?
namespace rocksdb {
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
// as production quality.
// SimpleTable requires input keys to be a fixed 16 bytes, values to be no
// longer than 150000 bytes, and stores data on disk in this format:
// +--------------------------------------------+ <= key1 offset
// | key1 | value_size (4 bytes) | |
// +----------------------------------------+ |
// | value1 |
// | |
// +----------------------------------------+---+ <= key2 offset
// | key2 | value_size (4 bytes) | |
// +----------------------------------------+ |
// | value2 |
// | |
// | ...... |
// +-----------------+--------------------------+ <= index_block_offset
// | key1 | key1 offset (8 bytes) |
// +-----------------+--------------------------+
// | key2 | key2 offset (8 bytes) |
// +-----------------+--------------------------+
// | key3 | key3 offset (8 bytes) |
// +-----------------+--------------------------+
// | ...... |
// +-----------------+------------+-------------+
// | index_block_offset (8 bytes) |
// +------------------------------+
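// For concreteness, with the Rep constants below (user_key_size = 16,
// key_footer_len = 8, offset_length = 8): each index entry is
// 16 + 8 + 8 = 32 bytes, so
//   num_entries = (file_size - 8 - index_block_offset) / 32,
// which is exactly how Open() below recovers the entry count from the
// 8-byte footer.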
class SimpleTableReader: public TableReader {
public:
// Attempt to open the table that is stored in bytes [0..file_size)
// of "file", and read the metadata entries necessary to allow
// retrieving data from the table.
//
// If successful, returns ok and sets "*table" to the newly opened
// table. The client should delete "*table" when no longer needed.
// If there was an error while initializing the table, sets "*table"
// to nullptr and returns a non-ok status. Does not take ownership of
// "*source", but the client must ensure that "source" remains live
// for the duration of the returned table's lifetime.
//
// *file must remain live while this Table is in use.
static Status Open(const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
unique_ptr<TableReader>* table_reader);
Iterator* NewIterator(const ReadOptions&, Arena* arena) override;
Status Get(const ReadOptions&, const Slice& key, void* arg,
bool (*handle_result)(void* arg, const ParsedInternalKey& k,
const Slice& v),
void (*mark_key_may_exist)(void*) = nullptr) override;
uint64_t ApproximateOffsetOf(const Slice& key) override;
virtual size_t ApproximateMemoryUsage() const override { return 0; }
void SetupForCompaction() override;
std::shared_ptr<const TableProperties> GetTableProperties() const override;
~SimpleTableReader();
private:
struct Rep;
Rep* rep_;
explicit SimpleTableReader(Rep* rep) {
rep_ = rep;
}
friend class TableCache;
friend class SimpleTableIterator;
Status GetOffset(const Slice& target, uint64_t* offset);
// No copying allowed
explicit SimpleTableReader(const TableReader&) = delete;
void operator=(const TableReader&) = delete;
};
// Iterator to iterate SimpleTable
class SimpleTableIterator: public Iterator {
public:
explicit SimpleTableIterator(SimpleTableReader* table);
~SimpleTableIterator();
bool Valid() const;
void SeekToFirst();
void SeekToLast();
void Seek(const Slice& target);
void Next();
void Prev();
Slice key() const;
Slice value() const;
Status status() const;
private:
SimpleTableReader* table_;
uint64_t offset_;
uint64_t next_offset_;
Slice key_;
Slice value_;
char tmp_str_[4];
char* key_str_;
char* value_str_;
int value_str_len_;
Status status_;
// No copying allowed
SimpleTableIterator(const SimpleTableIterator&) = delete;
void operator=(const Iterator&) = delete;
};
struct SimpleTableReader::Rep {
~Rep() {
}
Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
int num_entries) :
soptions(storage_options), index_start_offset(index_start_offset),
num_entries(num_entries) {
}
Options options;
const EnvOptions& soptions;
Status status;
unique_ptr<RandomAccessFile> file;
uint64_t index_start_offset;
int num_entries;
std::shared_ptr<TableProperties> table_properties;
const static int user_key_size = 16;
const static int offset_length = 8;
const static int key_footer_len = 8;
static int GetInternalKeyLength() {
return user_key_size + key_footer_len;
}
};
SimpleTableReader::~SimpleTableReader() {
delete rep_;
}
Status SimpleTableReader::Open(const Options& options,
const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file,
uint64_t size,
unique_ptr<TableReader>* table_reader) {
char footer_space[Rep::offset_length];
Slice footer_input;
Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
&footer_input, footer_space);
if (s.ok()) {
uint64_t index_start_offset = DecodeFixed64(footer_space);
int num_entries = (size - Rep::offset_length - index_start_offset)
/ (Rep::GetInternalKeyLength() + Rep::offset_length);
SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
index_start_offset,
num_entries);
rep->file = std::move(file);
rep->options = options;
table_reader->reset(new SimpleTableReader(rep));
}
return s;
}
void SimpleTableReader::SetupForCompaction() {
}
std::shared_ptr<const TableProperties> SimpleTableReader::GetTableProperties()
const {
return rep_->table_properties;
}
Iterator* SimpleTableReader::NewIterator(const ReadOptions& options,
Arena* arena) {
if (arena == nullptr) {
return new SimpleTableIterator(this);
} else {
auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator));
return new (mem) SimpleTableIterator(this);
}
}
Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
uint32_t left = 0;
uint32_t right = rep_->num_entries - 1;
char key_chars[Rep::GetInternalKeyLength()];
Slice tmp_slice;
uint32_t target_offset = 0;
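// Binary search over the fixed-width index entries for the first key
// >= target. If every indexed key is smaller, target_offset ends up past
// the last entry and the caller is handed index_start_offset, i.e. an
// exhausted iterator.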
while (left <= right) {
uint32_t mid = (left + right + 1) / 2;
uint64_t offset_to_read = rep_->index_start_offset
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
&tmp_slice, key_chars);
if (!s.ok()) {
return s;
}
InternalKeyComparator ikc(rep_->options.comparator);
int compare_result = ikc.Compare(tmp_slice, target);
if (compare_result < 0) {
if (left == right) {
target_offset = right + 1;
break;
}
left = mid;
} else {
if (left == right) {
target_offset = left;
break;
}
right = mid - 1;
}
}
if (target_offset >= (uint32_t) rep_->num_entries) {
*offset = rep_->index_start_offset;
return Status::OK();
}
char value_offset_chars[Rep::offset_length];
int64_t offset_for_value_offset = rep_->index_start_offset
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
+ Rep::GetInternalKeyLength();
Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
&tmp_slice, value_offset_chars);
if (s.ok()) {
*offset = DecodeFixed64(value_offset_chars);
}
return s;
}
Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
void* arg,
bool (*saver)(void*, const ParsedInternalKey&,
const Slice&),
void (*mark_key_may_exist)(void*)) {
Status s;
SimpleTableIterator* iter = new SimpleTableIterator(this);
for (iter->Seek(k); iter->Valid(); iter->Next()) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(iter->key(), &parsed_key)) {
return Status::Corruption(Slice());
}
if (!(*saver)(arg, parsed_key, iter->value())) {
break;
}
}
s = iter->status();
delete iter;
return s;
}
uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
return 0;
}
SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
table_(table) {
key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
value_str_len_ = -1;
SeekToFirst();
}
SimpleTableIterator::~SimpleTableIterator() {
delete[] key_str_;
if (value_str_len_ >= 0) {
delete[] value_str_;
}
}
bool SimpleTableIterator::Valid() const {
return offset_ < table_->rep_->index_start_offset;
}
void SimpleTableIterator::SeekToFirst() {
next_offset_ = 0;
Next();
}
void SimpleTableIterator::SeekToLast() {
assert(false);
}
void SimpleTableIterator::Seek(const Slice& target) {
Status s = table_->GetOffset(target, &next_offset_);
if (!s.ok()) {
status_ = s;
}
Next();
}
void SimpleTableIterator::Next() {
offset_ = next_offset_;
if (offset_ >= table_->rep_->index_start_offset) {
return;
}
Slice result;
int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
key_str_);
next_offset_ += internal_key_size;
key_ = result;
Slice value_size_slice;
s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
next_offset_ += 4;
uint32_t value_size = DecodeFixed32(tmp_str_);
Slice value_slice;
if ((int) value_size > value_str_len_) {
if (value_str_len_ >= 0) {
delete[] value_str_;
}
value_str_ = new char[value_size];
value_str_len_ = value_size;
}
s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
value_str_);
next_offset_ += value_size;
value_ = value_slice;
}
void SimpleTableIterator::Prev() {
assert(false);
}
Slice SimpleTableIterator::key() const {
Log(table_->rep_->options.info_log, "key!!!!");
return key_;
}
Slice SimpleTableIterator::value() const {
return value_;
}
Status SimpleTableIterator::status() const {
return status_;
}
class SimpleTableBuilder: public TableBuilder {
public:
// Create a builder that will store the contents of the table it is
// building in *file. Does not close the file. It is up to the
// caller to close the file after calling Finish(). The output file
// will be part of the level specified by 'level'. A value of -1 means
// that the caller does not know which level the output file will reside in.
SimpleTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type);
// REQUIRES: Either Finish() or Abandon() has been called.
~SimpleTableBuilder();
// Add key,value to the table being constructed.
// REQUIRES: key is after any previously added key according to comparator.
// REQUIRES: Finish(), Abandon() have not been called
void Add(const Slice& key, const Slice& value) override;
// Return non-ok iff some error has been detected.
Status status() const override;
// Finish building the table. Stops using the file passed to the
// constructor after this function returns.
// REQUIRES: Finish(), Abandon() have not been called
Status Finish() override;
// Indicate that the contents of this builder should be abandoned. Stops
// using the file passed to the constructor after this function returns.
// If the caller is not going to call Finish(), it must call Abandon()
// before destroying this builder.
// REQUIRES: Finish(), Abandon() have not been called
void Abandon() override;
// Number of calls to Add() so far.
uint64_t NumEntries() const override;
// Size of the file generated so far. If invoked after a successful
// Finish() call, returns the size of the final generated file.
uint64_t FileSize() const override;
private:
struct Rep;
Rep* rep_;
// No copying allowed
SimpleTableBuilder(const SimpleTableBuilder&) = delete;
void operator=(const SimpleTableBuilder&) = delete;
};
struct SimpleTableBuilder::Rep {
Options options;
WritableFile* file;
uint64_t offset = 0;
Status status;
uint64_t num_entries = 0;
bool closed = false; // Either Finish() or Abandon() has been called.
const static int user_key_size = 16;
const static int offset_length = 8;
const static int key_footer_len = 8;
static int GetInternalKeyLength() {
return user_key_size + key_footer_len;
}
std::string index;
Rep(const Options& opt, WritableFile* f) :
options(opt), file(f) {
}
~Rep() {
}
};
SimpleTableBuilder::SimpleTableBuilder(const Options& options,
WritableFile* file,
CompressionType compression_type) :
rep_(new SimpleTableBuilder::Rep(options, file)) {
}
SimpleTableBuilder::~SimpleTableBuilder() {
delete (rep_);
}
void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
assert((int) key.size() == Rep::GetInternalKeyLength());
// Update index
rep_->index.append(key.data(), key.size());
PutFixed64(&(rep_->index), rep_->offset);
// Write key-value pair
rep_->file->Append(key);
rep_->offset += Rep::GetInternalKeyLength();
std::string size;
int value_size = value.size();
PutFixed32(&size, value_size);
Slice sizeSlice(size);
rep_->file->Append(sizeSlice);
rep_->file->Append(value);
rep_->offset += value_size + 4;
rep_->num_entries++;
}
Status SimpleTableBuilder::status() const {
return Status::OK();
}
Status SimpleTableBuilder::Finish() {
Rep* r = rep_;
assert(!r->closed);
r->closed = true;
uint64_t index_offset = rep_->offset;
Slice index_slice(rep_->index);
rep_->file->Append(index_slice);
rep_->offset += index_slice.size();
std::string index_offset_str;
PutFixed64(&index_offset_str, index_offset);
Slice foot_slice(index_offset_str);
rep_->file->Append(foot_slice);
rep_->offset += foot_slice.size();
return Status::OK();
}
void SimpleTableBuilder::Abandon() {
rep_->closed = true;
}
uint64_t SimpleTableBuilder::NumEntries() const {
return rep_->num_entries;
}
uint64_t SimpleTableBuilder::FileSize() const {
return rep_->offset;
}
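As a reading aid, a minimal standalone sketch (not part of this diff; DecodeFixed64LE and ReadIndexOffset are hypothetical helpers) of how a reader could locate the index that Finish() writes above: the last 8 bytes of the file are a little-endian fixed64 holding the offset at which the (key, offset) index begins.
#include <cstdint>
#include <string>
// Decode a little-endian fixed-width 64-bit integer, matching PutFixed64.
static uint64_t DecodeFixed64LE(const char* p) {
  uint64_t v = 0;
  for (int i = 7; i >= 0; --i) {
    v = (v << 8) | static_cast<unsigned char>(p[i]);
  }
  return v;
}
// Returns the index offset stored in the 8-byte footer, or 0 if the
// file is too small to contain a footer.
static uint64_t ReadIndexOffset(const std::string& file_contents) {
  if (file_contents.size() < 8) return 0;
  return DecodeFixed64LE(file_contents.data() + file_contents.size() - 8);
}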
class SimpleTableFactory: public TableFactory {
public:
~SimpleTableFactory() {
}
SimpleTableFactory() {
}
const char* Name() const override {
return "SimpleTable";
}
Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const;
TableBuilder* NewTableBuilder(const Options& options,
const InternalKeyComparator& internal_key,
WritableFile* file,
CompressionType compression_type) const;
virtual Status SanitizeDBOptions(DBOptions* db_opts) const override {
return Status::OK();
}
virtual std::string GetPrintableTableOptions() const override {
return std::string();
}
};
Status SimpleTableFactory::NewTableReader(
const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const {
return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
table_reader);
}
TableBuilder* SimpleTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_key,
WritableFile* file, CompressionType compression_type) const {
return new SimpleTableBuilder(options, file, compression_type);
}
class SimpleTableDBTest {
protected:
public:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
SimpleTableDBTest() :
env_(Env::Default()) {
dbname_ = test::TmpDir() + "/simple_table_db_test";
ASSERT_OK(DestroyDB(dbname_, Options()));
db_ = nullptr;
Reopen();
}
~SimpleTableDBTest() {
delete db_;
ASSERT_OK(DestroyDB(dbname_, Options()));
}
// Return the current option configuration.
Options CurrentOptions() {
Options options;
options.table_factory.reset(new SimpleTableFactory());
return options;
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void Close() {
delete db_;
db_ = nullptr;
}
void DestroyAndReopen(Options* options = nullptr) {
// Destroy using last options.
Destroy(&last_options_);
ASSERT_OK(TryReopen(options));
}
void Destroy(Options* options) {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, *options));
}
Status PureReopen(Options* options, DB** db) {
return DB::Open(*options, dbname_, db);
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
Options opts;
if (options != nullptr) {
opts = *options;
} else {
opts = CurrentOptions();
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const Slice& k, const Slice& v) {
return db_->Put(WriteOptions(), k, v);
}
Status Delete(const std::string& k) {
return db_->Delete(WriteOptions(), k);
}
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
int NumTableFilesAtLevel(int level) {
std::string property;
ASSERT_TRUE(
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
&property));
return atoi(property.c_str());
}
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < db_->NumberLevels(); level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
result = iter->key().ToString() + "->" + iter->value().ToString();
} else {
result = "(invalid)";
}
return result;
}
};
TEST(SimpleTableDBTest, Empty) {
ASSERT_TRUE(db_ != nullptr);
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
TEST(SimpleTableDBTest, ReadWrite) {
ASSERT_OK(Put("0000000000000foo", "v1"));
ASSERT_EQ("v1", Get("0000000000000foo"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("0000000000000foo", "v3"));
ASSERT_EQ("v3", Get("0000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(SimpleTableDBTest, Flush) {
ASSERT_OK(Put("0000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("0000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(SimpleTableDBTest, Flush2) {
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("0000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("0000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("0000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key_______%06d", i);
return std::string(buf);
}
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
TEST(SimpleTableDBTest, CompactionTrigger) {
Options options = CurrentOptions();
options.write_buffer_size = 100 << 10; // 100KB
options.num_levels = 3;
options.max_mem_compaction_level = 0;
options.level0_file_num_compaction_trigger = 3;
Reopen(&options);
Random rnd(301);
for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
num++) {
std::vector<std::string> values;
// Write 120KB (12 values, each 10K)
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
}
// Generate one more file in level-0, which should trigger level-0 compaction.
std::vector<std::string> values;
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForCompact();
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

@ -32,10 +32,10 @@
#pragma once
#include <assert.h>
#include <atomic>
#include <stdlib.h>
#include "util/arena.h"
#include "port/port.h"
#include "util/arena.h"
#include "util/allocator.h"
#include "util/random.h"
namespace rocksdb {
@ -47,9 +47,9 @@ class SkipList {
public:
// Create a new SkipList object that will use "cmp" for comparing keys,
// and will allocate memory using "*arena". Objects allocated in the arena
// must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Arena* arena,
// and will allocate memory using "*allocator". Objects allocated in the
// allocator must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Allocator* allocator,
int32_t max_height = 12, int32_t branching_factor = 4);
// Insert key into the list.
@ -109,21 +109,20 @@ class SkipList {
// Immutable after construction
Comparator const compare_;
Arena* const arena_; // Arena used for allocations of nodes
Allocator* const allocator_; // Allocator used for allocations of nodes
Node* const head_;
// Modified only by Insert(). Read racily by readers, but stale
// values are ok.
port::AtomicPointer max_height_; // Height of the entire list
std::atomic<int> max_height_; // Height of the entire list
// Used for optimizing sequential insert patterns
Node** prev_;
int32_t prev_height_;
inline int GetMaxHeight() const {
return static_cast<int>(
reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
return max_height_.load(std::memory_order_relaxed);
}
// Read/written only by Insert().
@ -169,35 +168,35 @@ struct SkipList<Key, Comparator>::Node {
assert(n >= 0);
// Use an 'acquire load' so that we observe a fully initialized
// version of the returned Node.
return reinterpret_cast<Node*>(next_[n].Acquire_Load());
return (next_[n].load(std::memory_order_acquire));
}
void SetNext(int n, Node* x) {
assert(n >= 0);
// Use a 'release store' so that anybody who reads through this
// pointer observes a fully initialized version of the inserted node.
next_[n].Release_Store(x);
next_[n].store(x, std::memory_order_release);
}
// No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next(int n) {
assert(n >= 0);
return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
return next_[n].load(std::memory_order_relaxed);
}
void NoBarrier_SetNext(int n, Node* x) {
assert(n >= 0);
next_[n].NoBarrier_Store(x);
next_[n].store(x, std::memory_order_relaxed);
}
private:
// Array of length equal to the node height. next_[0] is lowest level link.
port::AtomicPointer next_[1];
std::atomic<Node*> next_[1];
};
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
char* mem = arena_->AllocateAligned(
sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
char* mem = allocator_->AllocateAligned(
sizeof(Node) + sizeof(std::atomic<Node*>) * (height - 1));
return new (mem) Node(key);
}
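The Next()/SetNext() comments above describe the standard release/acquire publication idiom. A minimal self-contained sketch of the same idiom, using hypothetical Payload/slot names unrelated to the skiplist:
#include <atomic>
#include <cassert>
struct Payload {
  int value = 0;
};
std::atomic<Payload*> slot{nullptr};
void Publisher() {
  Payload* p = new Payload;
  p->value = 42;                             // initialize first
  slot.store(p, std::memory_order_release);  // then publish
}
void Consumer() {
  Payload* p = slot.load(std::memory_order_acquire);
  if (p != nullptr) {
    // The acquire load pairs with the release store, so the
    // initialization of *p is guaranteed to be visible here.
    assert(p->value == 42);
  }
}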
@ -356,23 +355,24 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
}
template<typename Key, class Comparator>
SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
int32_t max_height,
int32_t branching_factor)
: kMaxHeight_(max_height),
kBranching_(branching_factor),
compare_(cmp),
arena_(arena),
allocator_(allocator),
head_(NewNode(0 /* any key will do */, max_height)),
max_height_(reinterpret_cast<void*>(1)),
max_height_(1),
prev_height_(1),
rnd_(0xdeadbeef) {
assert(kMaxHeight_ > 0);
assert(kBranching_ > 0);
// Allocate the prev_ Node* array, directly from the passed-in arena.
// Allocate the prev_ Node* array, directly from the passed-in allocator.
// prev_ does not need to be freed, as its life cycle is tied up with
// the arena as a whole.
prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
// the allocator as a whole.
prev_ = reinterpret_cast<Node**>(
allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_));
for (int i = 0; i < kMaxHeight_; i++) {
head_->SetNext(i, nullptr);
prev_[i] = head_;
@ -402,7 +402,7 @@ void SkipList<Key, Comparator>::Insert(const Key& key) {
// the loop below. In the former case the reader will
// immediately drop to the next level since nullptr sorts after all
// keys. In the latter case the reader will use the new node.
max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
max_height_.store(height, std::memory_order_relaxed);
}
x = NewNode(key, height);

@ -191,13 +191,11 @@ class ConcurrentTest {
// Per-key generation
struct State {
port::AtomicPointer generation[K];
void Set(int k, intptr_t v) {
generation[k].Release_Store(reinterpret_cast<void*>(v));
}
intptr_t Get(int k) {
return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
std::atomic<int> generation[K];
void Set(int k, int v) {
generation[k].store(v, std::memory_order_release);
}
int Get(int k) { return generation[k].load(std::memory_order_acquire); }
State() {
for (unsigned int k = 0; k < K; k++) {
@ -221,9 +219,9 @@ class ConcurrentTest {
// REQUIRES: External synchronization
void WriteStep(Random* rnd) {
const uint32_t k = rnd->Next() % K;
const intptr_t g = current_.Get(k) + 1;
const Key key = MakeKey(k, g);
list_.Insert(key);
const int g = current_.Get(k) + 1;
const Key new_key = MakeKey(k, g);
list_.Insert(new_key);
current_.Set(k, g);
}
@ -255,11 +253,10 @@ class ConcurrentTest {
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0U) ||
(gen(pos) > (uint64_t)initial_state.Get(key(pos)))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "
<< initial_state.Get(key(pos));
(gen(pos) > static_cast<uint64_t>(initial_state.Get(
static_cast<int>(key(pos))))))
<< "key: " << key(pos) << "; gen: " << gen(pos)
<< "; initgen: " << initial_state.Get(static_cast<int>(key(pos)));
// Advance to next key in the valid key space
if (key(pos) < key(current)) {
@ -303,7 +300,7 @@ class TestState {
public:
ConcurrentTest t_;
int seed_;
port::AtomicPointer quit_flag_;
std::atomic<bool> quit_flag_;
enum ReaderState {
STARTING,
@ -312,10 +309,7 @@ class TestState {
};
explicit TestState(int s)
: seed_(s),
quit_flag_(nullptr),
state_(STARTING),
state_cv_(&mu_) {}
: seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {}
void Wait(ReaderState s) {
mu_.Lock();
@ -343,7 +337,7 @@ static void ConcurrentReader(void* arg) {
Random rnd(state->seed_);
int64_t reads = 0;
state->Change(TestState::RUNNING);
while (!state->quit_flag_.Acquire_Load()) {
while (!state->quit_flag_.load(std::memory_order_acquire)) {
state->t_.ReadStep(&rnd);
++reads;
}
@ -362,10 +356,10 @@ static void RunConcurrent(int run) {
TestState state(seed + 1);
Env::Default()->Schedule(ConcurrentReader, &state);
state.Wait(TestState::RUNNING);
for (int i = 0; i < kSize; i++) {
for (int k = 0; k < kSize; k++) {
state.t_.WriteStep(&rnd);
}
state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do
state.quit_flag_.store(true, std::memory_order_release);
state.Wait(TestState::DONE);
}
}

@ -20,6 +20,8 @@ class SnapshotImpl : public Snapshot {
public:
SequenceNumber number_; // const after creation
virtual SequenceNumber GetSequenceNumber() const { return number_; }
private:
friend class SnapshotList;
@ -28,6 +30,8 @@ class SnapshotImpl : public Snapshot {
SnapshotImpl* next_;
SnapshotList* list_; // just for sanity checks
int64_t unix_time_;
};
class SnapshotList {
@ -36,20 +40,23 @@ class SnapshotList {
list_.prev_ = &list_;
list_.next_ = &list_;
list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
count_ = 0;
}
bool empty() const { return list_.next_ == &list_; }
SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
const SnapshotImpl* New(SequenceNumber seq) {
const SnapshotImpl* New(SequenceNumber seq, uint64_t unix_time) {
SnapshotImpl* s = new SnapshotImpl;
s->number_ = seq;
s->unix_time_ = unix_time;
s->list_ = this;
s->next_ = &list_;
s->prev_ = list_.prev_;
s->prev_->next_ = s;
s->next_->prev_ = s;
count_++;
return s;
}
@ -57,6 +64,7 @@ class SnapshotList {
assert(s->list_ == this);
s->prev_->next_ = s->next_;
s->next_->prev_ = s->prev_;
count_--;
delete s;
}
@ -71,16 +79,27 @@ class SnapshotList {
}
// get the sequence number of the most recent snapshot
const SequenceNumber GetNewest() {
SequenceNumber GetNewest() {
if (empty()) {
return 0;
}
return newest()->number_;
}
int64_t GetOldestSnapshotTime() const {
if (empty()) {
return 0;
} else {
return oldest()->unix_time_;
}
}
uint64_t count() const { return count_; }
private:
// Dummy head of doubly-linked list of snapshots
SnapshotImpl list_;
uint64_t count_;
};
} // namespace rocksdb
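SnapshotList stores snapshots in a circular doubly-linked list whose dummy head points at itself when the list is empty, so insertion and removal need no null checks. A minimal sketch of that pattern, with hypothetical Node/List names:
// Sentinel-head circular doubly-linked list, as used by SnapshotList above.
struct Node {
  Node* prev_ = this;
  Node* next_ = this;
};
struct List {
  Node head_;  // dummy head; never holds data
  bool empty() const { return head_.next_ == &head_; }
  void PushBack(Node* n) {  // mirrors the pointer surgery in New()
    n->next_ = &head_;
    n->prev_ = head_.prev_;
    n->prev_->next_ = n;
    n->next_->prev_ = n;
  }
  static void Remove(Node* n) {  // mirrors Delete()
    n->prev_->next_ = n->next_;
    n->next_->prev_ = n->prev_;
  }
};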

@ -15,6 +15,7 @@
#include "rocksdb/statistics.h"
#include "table/iterator_wrapper.h"
#include "table/table_reader.h"
#include "table/get_context.h"
#include "util/coding.h"
#include "util/stop_watch.h"
@ -36,12 +37,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) {
sizeof(*file_number));
}
TableCache::TableCache(const Options* options,
const EnvOptions& storage_options, Cache* const cache)
: env_(options->env),
db_paths_(options->db_paths),
options_(options),
storage_options_(storage_options),
TableCache::TableCache(const ImmutableCFOptions& ioptions,
const EnvOptions& env_options, Cache* const cache)
: ioptions_(ioptions),
env_options_(env_options),
cache_(cache) {}
TableCache::~TableCache() {
@ -55,7 +54,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
cache_->Release(handle);
}
Status TableCache::FindTable(const EnvOptions& toptions,
Status TableCache::FindTable(const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd, Cache::Handle** handle,
const bool no_io) {
@ -68,28 +67,27 @@ Status TableCache::FindTable(const EnvOptions& toptions,
return Status::Incomplete("Table not found in table_cache, no_io is set");
}
std::string fname =
TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId());
TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
unique_ptr<RandomAccessFile> file;
unique_ptr<TableReader> table_reader;
s = env_->NewRandomAccessFile(fname, &file, toptions);
RecordTick(options_->statistics.get(), NO_FILE_OPENS);
s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
RecordTick(ioptions_.statistics, NO_FILE_OPENS);
if (s.ok()) {
if (options_->advise_random_on_open) {
if (ioptions_.advise_random_on_open) {
file->Hint(RandomAccessFile::RANDOM);
}
StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
s = options_->table_factory->NewTableReader(
*options_, toptions, internal_comparator, std::move(file),
StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
s = ioptions_.table_factory->NewTableReader(
ioptions_, env_options, internal_comparator, std::move(file),
fd.GetFileSize(), &table_reader);
}
if (!s.ok()) {
assert(table_reader == nullptr);
RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
// We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically.
} else {
assert(file.get() == nullptr);
*handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
}
}
@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions,
}
Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions,
const EnvOptions& env_options,
const InternalKeyComparator& icomparator,
const FileDescriptor& fd,
TableReader** table_reader_ptr,
@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
Cache::Handle* handle = nullptr;
Status s;
if (table_reader == nullptr) {
s = FindTable(toptions, icomparator, fd, &handle,
s = FindTable(env_options, icomparator, fd, &handle,
options.read_tier == kBlockCacheTier);
if (!s.ok()) {
return NewErrorIterator(s, arena);
@ -134,34 +132,33 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
Status TableCache::Get(const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd, const Slice& k, void* arg,
bool (*saver)(void*, const ParsedInternalKey&,
const Slice&),
void (*mark_key_may_exist)(void*)) {
const FileDescriptor& fd, const Slice& k,
GetContext* get_context) {
TableReader* t = fd.table_reader;
Status s;
Cache::Handle* handle = nullptr;
if (!t) {
s = FindTable(storage_options_, internal_comparator, fd, &handle,
s = FindTable(env_options_, internal_comparator, fd, &handle,
options.read_tier == kBlockCacheTier);
if (s.ok()) {
t = GetTableReaderFromHandle(handle);
}
}
if (s.ok()) {
s = t->Get(options, k, arg, saver, mark_key_may_exist);
s = t->Get(options, k, get_context);
if (handle != nullptr) {
ReleaseHandle(handle);
}
} else if (options.read_tier && s.IsIncomplete()) {
// Couldn't find the table in the cache, but treat it as kFound if no_io is set
(*mark_key_may_exist)(arg);
get_context->MarkKeyMayExist();
return Status::OK();
}
return s;
}
Status TableCache::GetTableProperties(
const EnvOptions& toptions,
const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
std::shared_ptr<const TableProperties>* properties, bool no_io) {
Status s;
@ -174,7 +171,7 @@ Status TableCache::GetTableProperties(
}
Cache::Handle* table_handle = nullptr;
s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io);
s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io);
if (!s.ok()) {
return s;
}
@ -186,7 +183,7 @@ Status TableCache::GetTableProperties(
}
size_t TableCache::GetMemoryUsageByTableReader(
const EnvOptions& toptions,
const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd) {
Status s;
@ -197,7 +194,7 @@ size_t TableCache::GetMemoryUsageByTableReader(
}
Cache::Handle* table_handle = nullptr;
s = FindTable(toptions, internal_comparator, fd, &table_handle, true);
s = FindTable(env_options, internal_comparator, fd, &table_handle, true);
if (!s.ok()) {
return 0;
}

@ -19,6 +19,7 @@
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/options.h"
#include "table/table_reader.h"
namespace rocksdb {
@ -26,11 +27,12 @@ namespace rocksdb {
class Env;
class Arena;
struct FileDescriptor;
class GetContext;
class TableCache {
public:
TableCache(const Options* options, const EnvOptions& storage_options,
Cache* cache);
TableCache(const ImmutableCFOptions& ioptions,
const EnvOptions& storage_options, Cache* cache);
~TableCache();
// Return an iterator for the specified file number (the corresponding
@ -51,10 +53,8 @@ class TableCache {
// it returns false.
Status Get(const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& file_fd, const Slice& k, void* arg,
bool (*handle_result)(void*, const ParsedInternalKey&,
const Slice&),
void (*mark_key_may_exist)(void*) = nullptr);
const FileDescriptor& file_fd, const Slice& k,
GetContext* get_context);
// Evict any entry for the specified file number
static void Evict(Cache* cache, uint64_t file_number);
@ -91,10 +91,8 @@ class TableCache {
void ReleaseHandle(Cache::Handle* handle);
private:
Env* const env_;
const std::vector<DbPath> db_paths_;
const Options* options_;
const EnvOptions& storage_options_;
const ImmutableCFOptions& ioptions_;
const EnvOptions& env_options_;
Cache* const cache_;
};

@ -7,6 +7,7 @@
#include "db/dbformat.h"
#include "util/coding.h"
#include "util/string_util.h"
namespace rocksdb {
@ -40,7 +41,7 @@ Status InternalKeyPropertiesCollector::Finish(
UserCollectedProperties
InternalKeyPropertiesCollector::GetReadableProperties() const {
return {
{ "kDeletedKeys", std::to_string(deleted_keys_) }
{ "kDeletedKeys", ToString(deleted_keys_) }
};
}

@ -11,6 +11,7 @@
#include "db/dbformat.h"
#include "db/table_properties_collector.h"
#include "rocksdb/table.h"
#include "rocksdb/immutable_options.h"
#include "table/block_based_table_factory.h"
#include "table/meta_blocks.h"
#include "table/plain_table_factory.h"
@ -78,6 +79,7 @@ class FakeRandomeAccessFile : public RandomAccessFile {
class DumbLogger : public Logger {
public:
using Logger::Logv;
virtual void Logv(const char* format, va_list ap) { }
virtual size_t GetLogFileSize() const { return 0; }
};
@ -85,12 +87,14 @@ class DumbLogger : public Logger {
// Utilities test functions
namespace {
void MakeBuilder(const Options& options,
const ImmutableCFOptions& ioptions,
const InternalKeyComparator& internal_comparator,
std::unique_ptr<FakeWritableFile>* writable,
std::unique_ptr<TableBuilder>* builder) {
writable->reset(new FakeWritableFile);
builder->reset(options.table_factory->NewTableBuilder(
options, internal_comparator, writable->get(), options.compression));
builder->reset(ioptions.table_factory->NewTableBuilder(
ioptions, internal_comparator, writable->get(),
options.compression, options.compression_opts));
}
} // namespace
@ -153,7 +157,8 @@ void TestCustomizedTablePropertiesCollector(
// -- Step 1: build table
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
MakeBuilder(options, internal_comparator, &writable, &builder);
const ImmutableCFOptions ioptions(options);
MakeBuilder(options, ioptions, internal_comparator, &writable, &builder);
for (const auto& kv : kvs) {
if (encode_as_internal) {
@ -264,9 +269,10 @@ void TestInternalKeyPropertiesCollector(
options.table_properties_collector_factories = {
std::make_shared<InternalKeyPropertiesCollectorFactory>()};
}
const ImmutableCFOptions ioptions(options);
for (int iter = 0; iter < 2; ++iter) {
MakeBuilder(options, pikc, &writable, &builder);
MakeBuilder(options, ioptions, pikc, &writable, &builder);
for (const auto& k : keys) {
builder->Add(k.Encode(), "val");
}

@ -4,6 +4,11 @@
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "db/transaction_log_impl.h"
#include "db/write_batch_internal.h"
@ -13,7 +18,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seq,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions)
: dir_(dir),
options_(options),
read_options_(read_options),
@ -25,9 +30,9 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
currentFileIndex_(0),
currentBatchSeq_(0),
currentLastSeq_(0),
dbimpl_(dbimpl) {
versions_(versions) {
assert(files_ != nullptr);
assert(dbimpl_ != nullptr);
assert(versions_ != nullptr);
reporter_.env = options_->env;
reporter_.info_log = options_->info_log.get();
@ -43,14 +48,14 @@ Status TransactionLogIteratorImpl::OpenLogFile(
return env->NewSequentialFile(fname, file, soptions_);
} else {
std::string fname = LogFileName(dir_, logFile->LogNumber());
Status status = env->NewSequentialFile(fname, file, soptions_);
if (!status.ok()) {
Status s = env->NewSequentialFile(fname, file, soptions_);
if (!s.ok()) {
// If we cannot open the file in the DB directory, try the archive
// directory, as the file may have been moved there in the meantime.
fname = ArchivedLogFileName(dir_, logFile->LogNumber());
status = env->NewSequentialFile(fname, file, soptions_);
s = env->NewSequentialFile(fname, file, soptions_);
}
return status;
return s;
}
}
@ -74,7 +79,7 @@ bool TransactionLogIteratorImpl::RestrictedRead(
Slice* record,
std::string* scratch) {
// Don't read if no more complete entries to read from logs
if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
if (currentLastSeq_ >= versions_->LastSequence()) {
return false;
}
return currentLogReader_->ReadRecord(record, scratch);
@ -177,15 +182,15 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) {
// Open the next file
if (currentFileIndex_ < files_->size() - 1) {
++currentFileIndex_;
Status status =OpenLogReader(files_->at(currentFileIndex_).get());
if (!status.ok()) {
Status s = OpenLogReader(files_->at(currentFileIndex_).get());
if (!s.ok()) {
isValid_ = false;
currentStatus_ = status;
currentStatus_ = s;
return;
}
} else {
isValid_ = false;
if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
if (currentLastSeq_ == versions_->LastSequence()) {
currentStatus_ = Status::OK();
} else {
currentStatus_ = Status::Corruption("NO MORE DATA LEFT");
@ -203,12 +208,10 @@ bool TransactionLogIteratorImpl::IsBatchExpected(
if (batchSeq != expectedSeq) {
char buf[200];
snprintf(buf, sizeof(buf),
"Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
"Last flushed seq=%lu.Log iterator will reseek the correct "
"batch.",
(unsigned long)batchSeq,
(unsigned long)expectedSeq,
(unsigned long)dbimpl_->GetLatestSequenceNumber());
"Discontinuity in log records. Got seq=%" PRIu64
", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
".Log iterator will reseek the correct batch.",
batchSeq, expectedSeq, versions_->LastSequence());
reporter_.Info(buf);
return false;
}
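The reworked snprintf above relies on the <inttypes.h> format macros, which expand to the correct conversion specifier for uint64_t on each platform ("lu" on some, "llu" on others). A minimal standalone sketch of the idiom:
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <stdio.h>
// Format a 64-bit sequence number portably, without casting to
// unsigned long and hoping it is wide enough.
void PrintSeq(uint64_t seq) {
  char buf[64];
  snprintf(buf, sizeof(buf), "seq=%" PRIu64, seq);
  puts(buf);
}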
@ -240,7 +243,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
currentLastSeq_ = currentBatchSeq_ +
WriteBatchInternal::Count(batch.get()) - 1;
// currentBatchSeq_ can only change here
assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
assert(currentLastSeq_ <= versions_->LastSequence());
currentBatch_ = std::move(batch);
isValid_ = true;
@ -249,9 +252,9 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
unique_ptr<SequentialFile> file;
Status status = OpenLogFile(logFile, &file);
if (!status.ok()) {
return status;
Status s = OpenLogFile(logFile, &file);
if (!s.ok()) {
return s;
}
assert(file);
currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,

@ -11,23 +11,12 @@
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"
#include "db/db_impl.h"
#include "db/version_set.h"
#include "db/log_reader.h"
#include "db/filename.h"
namespace rocksdb {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
virtual void Corruption(size_t bytes, const Status& s) {
Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
}
virtual void Info(const char* s) {
Log(info_log, "%s", s);
}
};
class LogFileImpl : public LogFile {
public:
LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
@ -71,7 +60,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seqNum,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions);
virtual bool Valid();
@ -95,10 +84,24 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
std::unique_ptr<WriteBatch> currentBatch_;
unique_ptr<log::Reader> currentLogReader_;
Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
LogReporter reporter_;
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
virtual void Corruption(size_t bytes, const Status& s) {
Log(InfoLogLevel::ERROR_LEVEL, info_log, "dropping %zu bytes; %s", bytes,
s.ToString().c_str());
}
virtual void Info(const char* s) {
Log(InfoLogLevel::INFO_LEVEL, info_log, "%s", s);
}
} reporter_;
SequenceNumber currentBatchSeq_; // sequence number at start of current batch
SequenceNumber currentLastSeq_; // last sequence in the current batch
DBImpl const * const dbimpl_; // The db on whose log files this iterates
// Used only to get latest seq. num
// TODO(icanadi) can this be just a callback?
VersionSet const* const versions_;
// Reads from transaction log only if the writebatch record has been written
bool RestrictedRead(Slice* record, std::string* scratch);

@ -0,0 +1,330 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_builder.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <algorithm>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "db/dbformat.h"
#include "db/table_cache.h"
#include "db/version_set.h"
#include "table/table_reader.h"
namespace rocksdb {
bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
if (a->smallest_seqno != b->smallest_seqno) {
return a->smallest_seqno > b->smallest_seqno;
}
if (a->largest_seqno != b->largest_seqno) {
return a->largest_seqno > b->largest_seqno;
}
// Break ties by file number
return a->fd.GetNumber() > b->fd.GetNumber();
}
namespace {
bool BySmallestKey(FileMetaData* a, FileMetaData* b,
const InternalKeyComparator* cmp) {
int r = cmp->Compare(a->smallest, b->smallest);
if (r != 0) {
return (r < 0);
}
// Break ties by file number
return (a->fd.GetNumber() < b->fd.GetNumber());
}
} // namespace
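NewestFirstBySeqNo keeps level-0 files ordered newest first, with file numbers breaking ties. A self-contained analog (hypothetical FileInfo struct) showing how such a strict-weak-ordering comparator plugs into std::sort:
#include <algorithm>
#include <cstdint>
#include <vector>
struct FileInfo {
  uint64_t smallest_seqno;
  uint64_t largest_seqno;
  uint64_t number;
};
// Newer files (larger sequence numbers) sort first; ties fall back to
// the file number, so the ordering is total and deterministic.
bool NewestFirst(const FileInfo& a, const FileInfo& b) {
  if (a.smallest_seqno != b.smallest_seqno) {
    return a.smallest_seqno > b.smallest_seqno;
  }
  if (a.largest_seqno != b.largest_seqno) {
    return a.largest_seqno > b.largest_seqno;
  }
  return a.number > b.number;
}
void SortLevel0(std::vector<FileInfo>* files) {
  std::sort(files->begin(), files->end(), NewestFirst);
}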
class VersionBuilder::Rep {
private:
// Helper to sort files_ in v
// kLevel0 -- NewestFirstBySeqNo
// kLevelNon0 -- BySmallestKey
struct FileComparator {
enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method;
const InternalKeyComparator* internal_comparator;
bool operator()(FileMetaData* f1, FileMetaData* f2) const {
switch (sort_method) {
case kLevel0:
return NewestFirstBySeqNo(f1, f2);
case kLevelNon0:
return BySmallestKey(f1, f2, internal_comparator);
}
assert(false);
return false;
}
};
struct LevelState {
std::unordered_set<uint64_t> deleted_files;
// Map from file number to file meta data.
std::unordered_map<uint64_t, FileMetaData*> added_files;
};
const EnvOptions& env_options_;
TableCache* table_cache_;
VersionStorageInfo* base_vstorage_;
LevelState* levels_;
FileComparator level_zero_cmp_;
FileComparator level_nonzero_cmp_;
public:
Rep(const EnvOptions& env_options, TableCache* table_cache,
VersionStorageInfo* base_vstorage)
: env_options_(env_options),
table_cache_(table_cache),
base_vstorage_(base_vstorage) {
levels_ = new LevelState[base_vstorage_->num_levels()];
level_zero_cmp_.sort_method = FileComparator::kLevel0;
level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
level_nonzero_cmp_.internal_comparator =
base_vstorage_->InternalComparator();
}
~Rep() {
for (int level = 0; level < base_vstorage_->num_levels(); level++) {
const auto& added = levels_[level].added_files;
for (auto& pair : added) {
UnrefFile(pair.second);
}
}
delete[] levels_;
}
void UnrefFile(FileMetaData* f) {
f->refs--;
if (f->refs <= 0) {
if (f->table_reader_handle) {
assert(table_cache_ != nullptr);
table_cache_->ReleaseHandle(f->table_reader_handle);
f->table_reader_handle = nullptr;
}
delete f;
}
}
void CheckConsistency(VersionStorageInfo* vstorage) {
#ifndef NDEBUG
// make sure the files are sorted correctly
for (int level = 0; level < vstorage->num_levels(); level++) {
auto& level_files = vstorage->LevelFiles(level);
for (size_t i = 1; i < level_files.size(); i++) {
auto f1 = level_files[i - 1];
auto f2 = level_files[i];
if (level == 0) {
assert(level_zero_cmp_(f1, f2));
assert(f1->largest_seqno > f2->largest_seqno);
} else {
assert(level_nonzero_cmp_(f1, f2));
// Make sure there is no overlap in levels > 0
if (vstorage->InternalComparator()->Compare(f1->largest,
f2->smallest) >= 0) {
fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
(f1->largest).DebugString().c_str(),
(f2->smallest).DebugString().c_str());
abort();
}
}
}
}
#endif
}
void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
int level) {
#ifndef NDEBUG
// A file to be deleted must exist in the previous version.
bool found = false;
for (int l = 0; !found && l < base_vstorage_->num_levels(); l++) {
const std::vector<FileMetaData*>& base_files =
base_vstorage_->LevelFiles(l);
for (unsigned int i = 0; i < base_files.size(); i++) {
FileMetaData* f = base_files[i];
if (f->fd.GetNumber() == number) {
found = true;
break;
}
}
}
// If the file did not exist in the previous version, it may have been
// moved from a lower level to a higher level in the current version.
for (int l = level + 1; !found && l < base_vstorage_->num_levels(); l++) {
auto& level_added = levels_[l].added_files;
auto got = level_added.find(number);
if (got != level_added.end()) {
found = true;
break;
}
}
// Maybe this file was added in a previous edit that was already applied.
if (!found) {
auto& level_added = levels_[level].added_files;
auto got = level_added.find(number);
if (got != level_added.end()) {
found = true;
}
}
if (!found) {
fprintf(stderr, "not found %" PRIu64 "\n", number);
}
assert(found);
#endif
}
// Apply all of the edits in *edit to the current state.
void Apply(VersionEdit* edit) {
CheckConsistency(base_vstorage_);
// Delete files
const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles();
for (const auto& del_file : del) {
const auto level = del_file.first;
const auto number = del_file.second;
levels_[level].deleted_files.insert(number);
CheckConsistencyForDeletes(edit, number, level);
auto existing = levels_[level].added_files.find(number);
if (existing != levels_[level].added_files.end()) {
UnrefFile(existing->second);
levels_[level].added_files.erase(number);
}
}
// Add new files
for (const auto& new_file : edit->GetNewFiles()) {
const int level = new_file.first;
FileMetaData* f = new FileMetaData(new_file.second);
f->refs = 1;
assert(levels_[level].added_files.find(f->fd.GetNumber()) ==
levels_[level].added_files.end());
levels_[level].deleted_files.erase(f->fd.GetNumber());
levels_[level].added_files[f->fd.GetNumber()] = f;
}
}
// Save the current state in *v.
void SaveTo(VersionStorageInfo* vstorage) {
CheckConsistency(base_vstorage_);
CheckConsistency(vstorage);
for (int level = 0; level < base_vstorage_->num_levels(); level++) {
const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
// Merge the set of added files with the set of pre-existing files.
// Drop any deleted files. Store the result in *v.
const auto& base_files = base_vstorage_->LevelFiles(level);
auto base_iter = base_files.begin();
auto base_end = base_files.end();
const auto& unordered_added_files = levels_[level].added_files;
vstorage->Reserve(level,
base_files.size() + unordered_added_files.size());
// Sort added files for the level.
std::vector<FileMetaData*> added_files;
added_files.reserve(unordered_added_files.size());
for (const auto& pair : unordered_added_files) {
added_files.push_back(pair.second);
}
std::sort(added_files.begin(), added_files.end(), cmp);
#ifndef NDEBUG
FileMetaData* prev_file = nullptr;
#endif
for (const auto& added : added_files) {
#ifndef NDEBUG
if (level > 0 && prev_file != nullptr) {
assert(base_vstorage_->InternalComparator()->Compare(
prev_file->smallest, added->smallest) <= 0);
}
prev_file = added;
#endif
// Add all smaller files listed in the base version
for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp);
base_iter != bpos; ++base_iter) {
MaybeAddFile(vstorage, level, *base_iter);
}
MaybeAddFile(vstorage, level, added);
}
// Add remaining base files
for (; base_iter != base_end; ++base_iter) {
MaybeAddFile(vstorage, level, *base_iter);
}
}
CheckConsistency(vstorage);
}
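SaveTo() above is a single-pass merge of two sorted sequences: for each added file it first emits every base file that sorts ahead of it (the std::upper_bound loop), then the added file itself, and finally the leftover base files. A minimal sketch of the same pattern over plain integers (MergeSorted is a hypothetical helper):
#include <algorithm>
#include <vector>
// Merge two individually sorted vectors into one sorted output, the way
// SaveTo() interleaves added files with base files.
std::vector<int> MergeSorted(const std::vector<int>& base,
                             const std::vector<int>& added) {
  std::vector<int> out;
  out.reserve(base.size() + added.size());
  auto base_iter = base.begin();
  const auto base_end = base.end();
  for (int a : added) {
    // Emit every base element that sorts before (or ties with) `a`.
    for (auto bpos = std::upper_bound(base_iter, base_end, a);
         base_iter != bpos; ++base_iter) {
      out.push_back(*base_iter);
    }
    out.push_back(a);
  }
  for (; base_iter != base_end; ++base_iter) {
    out.push_back(*base_iter);  // remaining tail of base
  }
  return out;
}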
void LoadTableHandlers() {
assert(table_cache_ != nullptr);
for (int level = 0; level < base_vstorage_->num_levels(); level++) {
for (auto& file_meta_pair : levels_[level].added_files) {
auto* file_meta = file_meta_pair.second;
assert(!file_meta->table_reader_handle);
table_cache_->FindTable(
env_options_, *(base_vstorage_->InternalComparator()),
file_meta->fd, &file_meta->table_reader_handle, false);
if (file_meta->table_reader_handle != nullptr) {
// Load table_reader
file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
file_meta->table_reader_handle);
}
}
}
}
void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) {
if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
// File is deleted: do nothing
} else {
vstorage->AddFile(level, f);
}
}
};
VersionBuilder::VersionBuilder(const EnvOptions& env_options,
TableCache* table_cache,
VersionStorageInfo* base_vstorage)
: rep_(new Rep(env_options, table_cache, base_vstorage)) {}
VersionBuilder::~VersionBuilder() { delete rep_; }
void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
rep_->CheckConsistency(vstorage);
}
void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit,
uint64_t number, int level) {
rep_->CheckConsistencyForDeletes(edit, number, level);
}
void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); }
void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
rep_->SaveTo(vstorage);
}
void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); }
void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
FileMetaData* f) {
rep_->MaybeAddFile(vstorage, level, f);
}
} // namespace rocksdb

@ -0,0 +1,42 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
#pragma once
#include "rocksdb/env.h"
namespace rocksdb {
class TableCache;
class VersionStorageInfo;
class VersionEdit;
struct FileMetaData;
// A helper class so we can efficiently apply a whole sequence
// of edits to a particular state without creating intermediate
// Versions that contain full copies of the intermediate state.
class VersionBuilder {
public:
VersionBuilder(const EnvOptions& env_options, TableCache* table_cache,
VersionStorageInfo* base_vstorage);
~VersionBuilder();
void CheckConsistency(VersionStorageInfo* vstorage);
void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
int level);
void Apply(VersionEdit* edit);
void SaveTo(VersionStorageInfo* vstorage);
void LoadTableHandlers();
void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f);
private:
class Rep;
Rep* rep_;
};
extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b);
} // namespace rocksdb

@ -0,0 +1,229 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <string>
#include "db/version_edit.h"
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
class VersionBuilderTest {
public:
const Comparator* ucmp_;
InternalKeyComparator icmp_;
Options options_;
ImmutableCFOptions ioptions_;
MutableCFOptions mutable_cf_options_;
VersionStorageInfo vstorage_;
uint32_t file_num_;
CompactionOptionsFIFO fifo_options_;
std::vector<uint64_t> size_being_compacted_;
VersionBuilderTest()
: ucmp_(BytewiseComparator()),
icmp_(ucmp_),
ioptions_(options_),
mutable_cf_options_(options_, ioptions_),
vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
nullptr),
file_num_(1) {
mutable_cf_options_.RefreshDerivedOptions(ioptions_);
size_being_compacted_.resize(options_.num_levels);
}
~VersionBuilderTest() {
for (int i = 0; i < vstorage_.num_levels(); i++) {
for (auto* f : vstorage_.LevelFiles(i)) {
if (--f->refs == 0) {
delete f;
}
}
}
}
InternalKey GetInternalKey(const char* ukey,
SequenceNumber smallest_seq = 100) {
return InternalKey(ukey, smallest_seq, kTypeValue);
}
void Add(int level, uint32_t file_number, const char* smallest,
const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
SequenceNumber smallest_seq = 100,
SequenceNumber largest_seq = 100,
uint64_t num_entries = 0, uint64_t num_deletions = 0,
bool sampled = false) {
assert(level < vstorage_.num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, path_id, file_size);
f->smallest = GetInternalKey(smallest, smallest_seq);
f->largest = GetInternalKey(largest, largest_seq);
f->compensated_file_size = file_size;
f->refs = 0;
f->num_entries = num_entries;
f->num_deletions = num_deletions;
vstorage_.AddFile(level, f);
if (sampled) {
f->init_stats_from_file = true;
vstorage_.UpdateAccumulatedStats(f);
}
}
void UpdateVersionStorageInfo() {
vstorage_.UpdateFilesBySize();
vstorage_.UpdateNumNonEmptyLevels();
vstorage_.GenerateFileIndexer();
vstorage_.GenerateLevelFilesBrief();
vstorage_.SetFinalized();
}
};
TEST(VersionBuilderTest, ApplyAndSaveTo) {
Add(0, 1U, "150", "200", 100U);
// Level 1 score 1.2
Add(1, 66U, "150", "200", 100U);
Add(1, 88U, "201", "300", 100U);
// Level 2 score 1.8. File 7 is the largest. Should be picked
Add(2, 6U, "150", "179", 100U);
Add(2, 7U, "180", "220", 100U);
Add(2, 8U, "221", "300", 100U);
// Level 3 score slightly larger than 1
Add(3, 26U, "150", "170", 100U);
Add(3, 27U, "171", "179", 100U);
Add(3, 28U, "191", "220", 100U);
Add(3, 29U, "221", "300", 100U);
UpdateVersionStorageInfo();
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200);
version_edit.DeleteFile(3, 27U);
EnvOptions env_options;
VersionBuilder version_builder(env_options, nullptr, &vstorage_);
VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
kCompactionStyleLevel, nullptr);
version_builder.Apply(&version_edit);
version_builder.SaveTo(&new_vstorage);
ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
for (int i = 0; i < new_vstorage.num_levels(); i++) {
for (auto* f : new_vstorage.LevelFiles(i)) {
if (--f->refs == 0) {
delete f;
}
}
}
}
TEST(VersionBuilderTest, ApplyMultipleAndSaveTo) {
UpdateVersionStorageInfo();
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200);
EnvOptions env_options;
VersionBuilder version_builder(env_options, nullptr, &vstorage_);
VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
kCompactionStyleLevel, nullptr);
version_builder.Apply(&version_edit);
version_builder.SaveTo(&new_vstorage);
ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
for (int i = 0; i < new_vstorage.num_levels(); i++) {
for (auto* f : new_vstorage.LevelFiles(i)) {
if (--f->refs == 0) {
delete f;
}
}
}
}
TEST(VersionBuilderTest, ApplyDeleteAndSaveTo) {
UpdateVersionStorageInfo();
EnvOptions env_options;
VersionBuilder version_builder(env_options, nullptr, &vstorage_);
VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
kCompactionStyleLevel, nullptr);
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200);
version_builder.Apply(&version_edit);
VersionEdit version_edit2;
version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
GetInternalKey("950"), 200, 200);
version_edit2.DeleteFile(2, 616);
version_edit2.DeleteFile(2, 636);
version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
GetInternalKey("850"), 200, 200);
version_builder.Apply(&version_edit2);
version_builder.SaveTo(&new_vstorage);
ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
for (int i = 0; i < new_vstorage.num_levels(); i++) {
for (auto* f : new_vstorage.LevelFiles(i)) {
if (--f->refs == 0) {
delete f;
}
}
}
}
TEST(VersionBuilderTest, EstimatedActiveKeys) {
const uint32_t kTotalSamples = 20;
const uint32_t kNumLevels = 5;
const uint32_t kFilesPerLevel = 8;
const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
const uint32_t kEntriesPerFile = 1000;
const uint32_t kDeletionsPerFile = 100;
for (uint32_t i = 0; i < kNumFiles; ++i) {
Add(static_cast<int>(i / kFilesPerLevel), i + 1,
ToString((i + 100) * 1000).c_str(),
ToString((i + 100) * 1000 + 999).c_str(),
100U, 0, 100, 100,
kEntriesPerFile, kDeletionsPerFile,
(i < kTotalSamples));
}
// Subtract 2x the number of deletion entries because:
// 1x: a deletion entry does not itself count as a data entry.
// 1x: each deletion entry removes one existing data entry.
ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
(kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
}
} // namespace rocksdb
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

@ -64,7 +64,7 @@ void VersionEdit::Clear() {
column_family_name_.clear();
}
void VersionEdit::EncodeTo(std::string* dst) const {
bool VersionEdit::EncodeTo(std::string* dst) const {
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);
@ -98,6 +98,9 @@ void VersionEdit::EncodeTo(std::string* dst) const {
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
if (!f.smallest.Valid() || !f.largest.Valid()) {
return false;
}
if (f.fd.GetPathId() == 0) {
// Use older format to make sure user can roll back the build if they
// don't config multiple DB paths.
@ -131,6 +134,7 @@ void VersionEdit::EncodeTo(std::string* dst) const {
if (is_column_family_drop_) {
PutVarint32(dst, kColumnFamilyDrop);
}
return true;
}
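EncodeTo() serializes tags and lengths with PutVarint32 and PutLengthPrefixedSlice, which use the LevelDB-style base-128 varint: seven payload bits per byte, high bit set on every byte except the last. A minimal sketch of that encoding (AppendVarint32 is a hypothetical stand-in):
#include <cstdint>
#include <string>
// Base-128 varint: small values take one byte, and each byte carries
// seven bits of payload plus a continuation flag in the high bit.
void AppendVarint32(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7F) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}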
static bool GetInternalKey(Slice* input, InternalKey* dst) {
@ -164,7 +168,6 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
// Temporary storage for parsing
int level;
uint64_t number;
FileMetaData f;
Slice str;
InternalKey key;
@ -233,9 +236,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
}
break;
case kDeletedFile:
if (GetLevel(&input, &level, &msg) &&
GetVarint64(&input, &number)) {
case kDeletedFile: {
uint64_t number;
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
deleted_files_.insert(std::make_pair(level, number));
} else {
if (!msg) {
@ -243,6 +246,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
}
}
break;
}
case kNewFile: {
uint64_t number;
@ -293,7 +297,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
new_files_.push_back(std::make_pair(level, f));
} else {
if (!msg) {
msg = "new-file2 entry";
msg = "new-file3 entry";
}
}
break;

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <algorithm>
#include <set>
#include <utility>
#include <vector>
@ -38,10 +39,10 @@ struct FileDescriptor {
FileDescriptor() : FileDescriptor(0, 0, 0) {}
FileDescriptor(uint64_t number, uint32_t path_id, uint64_t file_size)
FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
: table_reader(nullptr),
packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
file_size(file_size) {}
file_size(_file_size) {}
FileDescriptor& operator=(const FileDescriptor& fd) {
table_reader = fd.table_reader;
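packed_number_and_path_id above folds the file number and the path id into a single uint64_t. A hypothetical sketch of one such packing (illustrative only; the exact bit layout behind PackFileNumberAndPathId is not shown in this diff), keeping the path id in the top two bits:
#include <cstdint>
constexpr uint64_t kFileNumberMask = (1ULL << 62) - 1;  // low 62 bits
constexpr uint64_t Pack(uint64_t number, uint32_t path_id) {
  return (number & kFileNumberMask) | (static_cast<uint64_t>(path_id) << 62);
}
constexpr uint64_t GetNumber(uint64_t packed) {
  return packed & kFileNumberMask;
}
constexpr uint32_t GetPathId(uint64_t packed) {
  return static_cast<uint32_t>(packed >> 62);
}
static_assert(GetNumber(Pack(42, 3)) == 42, "number round-trips");
static_assert(GetPathId(Pack(42, 3)) == 3, "path id round-trips");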
@ -74,9 +75,11 @@ struct FileMetaData {
// Stats for compensating deletion entries during compaction
// File size compensated by deletion entry.
// This is updated in Version::UpdateTemporaryStats() first time when the
// file is created or loaded. After it is updated, it is immutable.
// This is updated in Version::UpdateAccumulatedStats() first time when the
// file is created or loaded. After it is updated (!= 0), it is immutable.
uint64_t compensated_file_size;
// These values can mutate, but they can only be read or written from
// single-threaded LogAndApply thread
uint64_t num_entries; // the number of entries.
uint64_t num_deletions; // the number of deletion entries.
uint64_t raw_key_size; // total uncompressed key size.
@ -109,20 +112,16 @@ struct FdWithKeyRange {
largest_key() {
}
FdWithKeyRange(FileDescriptor fd,
Slice smallest_key, Slice largest_key)
: fd(fd),
smallest_key(smallest_key),
largest_key(largest_key) {
}
FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key)
: fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {}
};
// Data structure to store an array of FdWithKeyRange in one level
// Actual data is guaranteed to be stored closely
struct FileLevel {
struct LevelFilesBrief {
size_t num_files;
FdWithKeyRange* files;
FileLevel() {
LevelFilesBrief() {
num_files = 0;
files = nullptr;
}
@ -163,13 +162,13 @@ class VersionEdit {
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file, uint64_t file_size,
uint64_t file_path_id, const InternalKey& smallest,
void AddFile(int level, uint64_t file, uint32_t file_path_id,
uint64_t file_size, const InternalKey& smallest,
const InternalKey& largest, const SequenceNumber& smallest_seqno,
const SequenceNumber& largest_seqno) {
assert(smallest_seqno <= largest_seqno);
FileMetaData f;
f.fd = FileDescriptor(file, file_size, file_path_id);
f.fd = FileDescriptor(file, file_path_id, file_size);
f.smallest = smallest;
f.largest = largest;
f.smallest_seqno = smallest_seqno;
@ -183,9 +182,7 @@ class VersionEdit {
}
// Number of edits
int NumEntries() {
return new_files_.size() + deleted_files_.size();
}
size_t NumEntries() { return new_files_.size() + deleted_files_.size(); }
bool IsColumnFamilyManipulation() {
return is_column_family_add_ || is_column_family_drop_;
@ -212,17 +209,23 @@ class VersionEdit {
is_column_family_drop_ = true;
}
void EncodeTo(std::string* dst) const;
// return true on success.
bool EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src);
typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }
const std::vector<std::pair<int, FileMetaData>>& GetNewFiles() {
return new_files_;
}
std::string DebugString(bool hex_key = false) const;
private:
friend class VersionSet;
friend class Version;
typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
bool GetLevel(Slice* input, int* level, const char** msg);
int max_level_;
